Using Beautiful Soup#

Reading and parsing web pages with Beautiful Soup.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Data gathering
5from bs4 import BeautifulSoup

Set paths#

# Directory the input datasets are read from
data_path = "./datasets"
# Directory results are saved to
assets_path = "./assets"

Get HTML#

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# resulting tree can differ between machines. "html.parser" ships with
# Python; NOTE(review): pass "lxml" instead if that is the parser the
# outputs below were produced with.
with open(f"{data_path}/test.html", "r") as fd:
    soup = BeautifulSoup(fd, "html.parser")
    print(type(soup))
<class 'bs4.BeautifulSoup'>

Beautiful Soup wrangling#

# Print the parse tree re-indented, with each tag and each string on its
# own line
print(soup.prettify())
<html>
 <body>
  <h1>
   Lorem ipsum dolor sit amet consectetuer adipiscing 
elit
  </h1>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa
   <strong>
    strong
   </strong>
   . Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede
   <a class="external ext" href="#">
    link
   </a>
   mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam ultricies nisi vel augue. 
Curabitur ullamcorper ultricies nisi.
  </p>
  <h1>
   Lorem ipsum dolor sit amet consectetuer adipiscing 
elit
  </h1>
  <h2>
   Aenean commodo ligula eget dolor aenean massa
  </h2>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient 
montes, nascetur ridiculus mus. Donec quam felis, 
ultricies nec, pellentesque eu, pretium quis, sem.
  </p>
  <h2>
   Aenean commodo ligula eget dolor aenean massa
  </h2>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient 
montes, nascetur ridiculus mus. Donec quam felis, 
ultricies nec, pellentesque eu, pretium quis, sem.
  </p>
  <ul>
   <li>
    <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">
     Information Entropy
    </a>
   </li>
   <li>
    <a href="http://www.gutenberg.org/browse/scores/top">
     Top books in Gutenberg
    </a>
   </li>
   <li>
    <a href="https://www.imdb.com/chart/top">
     Top 250 movies in IMDB
    </a>
   </li>
  </ul>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient 
montes, nascetur ridiculus mus. Donec quam felis, 
ultricies nec, pellentesque eu, pretium quis, sem.
  </p>
  <form action="#" method="post">
   <fieldset>
    <label for="name">
     Name:
    </label>
    <input id="name" placeholder="Enter your 
full name" type="text"/>
    <label for="email">
     Email:
    </label>
    <input id="email" placeholder="Enter 
your email address" type="email"/>
    <label for="message">
     Message:
    </label>
    <textarea id="message" placeholder="What's on your 
mind?"></textarea>
    <input type="submit" value="Send message"/>
   </fieldset>
  </form>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient 
montes, nascetur ridiculus mus. Donec quam felis, 
ultricies nec, pellentesque eu, pretium quis, sem.
  </p>
  <table class="data">
   <tr>
    <th>
     Entry Header 1
    </th>
    <th>
     Entry Header 2
    </th>
    <th>
     Entry Header 3
    </th>
    <th>
     Entry Header 4
    </th>
   </tr>
   <tr>
    <td>
     Entry First Line 1
    </td>
    <td>
     Entry First Line 2
    </td>
    <td>
     Entry First Line 3
    </td>
    <td>
     Entry First Line 4
    </td>
   </tr>
   <tr>
    <td>
     Entry Line 1
    </td>
    <td>
     Entry Line 2
    </td>
    <td>
     Entry Line 3
    </td>
    <td>
     Entry Line 4
    </td>
   </tr>
   <tr>
    <td>
     Entry Last Line 1
    </td>
    <td>
     Entry Last Line 2
    </td>
    <td>
     Entry Last Line 3
    </td>
    <td>
     Entry Last Line 4
    </td>
   </tr>
  </table>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient 
montes, nascetur ridiculus mus. Donec quam felis, 
ultricies nec, pellentesque eu, pretium quis, sem.
  </p>
 </body>
</html>
# Attribute access by tag name returns the first matching tag, so this
# prints the first <p> in the document
print(soup.p)
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa 
<strong>strong</strong>. Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede <a class="external ext" href="#">link</a> 
mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam ultricies nisi vel augue. 
Curabitur ullamcorper ultricies nisi.</p>
# Using the find_all method to collect every <p> tag in the document
all_ps = soup.find_all("p")
print(f"Total number of <p> --- {len(all_ps)}")
Total number of <p> --- 6
# .contents is the list of a tag's direct children; soup.table is the
# first <table> in the document
table = soup.table
print(table.contents)
['\n', <tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>, '\n', <tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>, '\n', <tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>, '\n', <tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>, '\n']
# Traversing the direct children of a node one at a time; the separator
# line makes the whitespace-only text nodes between rows visible
for child in table.children:
    print(child)
    print("*****")
*****
<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>
*****


*****
<tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>
*****


*****
# .children yields only the immediate children of a tag, whereas
# .descendants walks everything nested beneath it. Both are iterators,
# so they are turned into lists to be counted.
children = table.children
descendants = table.descendants
print(len(list(children)), len(list(descendants)))
9 61

Creating dataset#

# Collect every table row (header row included). find_all is the current
# name of the method; findAll is a legacy alias kept for old code.
data = soup.find_all("tr")
print(f"Data is a {type(data)} and {len(data)} items long")
Data is a <class 'bs4.element.ResultSet'> and 4 items long
# In the HTML source the first table row holds the column headings and
# the remaining rows hold the data, so the two parts are kept in
# separate variables.
data_without_header = data[1:]
headers = data[0]
headers
<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>
# List comprehension pulling the text of each <th> cell to use as the
# DataFrame column names (find_all / get_text replace the legacy
# findAll / getText aliases)
columns = [th.get_text() for th in headers.find_all("th")]
columns
['Entry Header 1', 'Entry Header 2', 'Entry Header 3', 'Entry Header 4']
# The DataFrame needs a two-dimensional list: one inner list of cell
# texts per data row (find_all / get_text replace the legacy
# findAll / getText aliases)
data = [
    [td.get_text() for td in tr.find_all("td")] for tr in data_without_header
]
data
[['Entry First Line 1',
  'Entry First Line 2',
  'Entry First Line 3',
  'Entry First Line 4'],
 ['Entry Line 1', 'Entry Line 2', 'Entry Line 3', 'Entry Line 4'],
 ['Entry Last Line 1',
  'Entry Last Line 2',
  'Entry Last Line 3',
  'Entry Last Line 4']]
# Creating the dataset: one DataFrame row per table row, with the header
# texts as column names
dataset = pd.DataFrame(data, columns=columns)
dataset.head()
Entry Header 1 Entry Header 2 Entry Header 3 Entry Header 4
0 Entry First Line 1 Entry First Line 2 Entry First Line 3 Entry First Line 4
1 Entry Line 1 Entry Line 2 Entry Line 3 Entry Line 4
2 Entry Last Line 1 Entry Last Line 2 Entry Last Line 3 Entry Last Line 4