Manipulating HTML Content

Let us understand how to manipulate HTML content leveraging APIs provided by BeautifulSoup.

decompose - removes the tag along with its content.
unwrap - removes the tag but retains its content.
We can also change the attributes of a tag by using dict-style access on the tag object (for example, tag['class'] = 'special').
We can also wrap existing content or tags in new tags.

# Sample HTML used throughout this walkthrough: one paragraph followed by a
# table with a header row and two data rows, each containing a link.
html_str = """
<p>Some Text</p>
<table>
    <tbody>
        <tr>
            <th>Details</th>
            <th>URL</th>
        </tr>
        <tr>
            <td>Video Content</td>
            <td><a href="https://www.youtube.com/itversityin">YouTube Channel</a>
            </td>
        </tr>
        <tr>
            <td>Reference Material</td>
            <td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
            </td>
        </tr>
    </tbody>
</table>
"""

# Import from the public IPython.display module: IPython.core.display is an
# internal module and importing display from it is deprecated.
from IPython.display import HTML, display

# Render the raw HTML string as rich output in the notebook.
display(HTML(html_str))

Some Text

Details	URL
Video Content	YouTube Channel
Reference Material	GitHub Repository

from bs4 import BeautifulSoup

# Parse the HTML string into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(html_str, 'html.parser')
# prettify() returns the tree as an indented string with each tag and text
# node on its own line.
print(soup.prettify())

<p>
 Some Text
</p>
<table>
 <tbody>
  <tr>
   <th>
    Details
   </th>
   <th>
    URL
   </th>
  </tr>
  <tr>
   <td>
    Video Content
   </td>
   <td>
    <a href="https://www.youtube.com/itversityin">
     YouTube Channel
    </a>
   </td>
  </tr>
  <tr>
   <td>
    Reference Material
   </td>
   <td>
    <a href="https://www.github.com/dgadiraju/itversity-books">
     GitHub Repository
    </a>
   </td>
  </tr>
 </tbody>
</table>

Using decompose

# Locate the <p> tag in the document.
p = soup.find('p')

# decompose() removes the tag together with everything inside it.
p.decompose()

# The <p>Some Text</p> element no longer appears in the document.
soup

<table>
<tbody>
<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td><a href="https://www.youtube.com/itversityin">YouTube Channel</a>
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

Using unwrap

# Locate the first <a> tag in the document.
a = soup.find('a')

# Echo the tag so the notebook displays it; a bare assignment shows nothing.
a

<a href="https://www.youtube.com/itversityin">YouTube Channel</a>

# unwrap() replaces the tag with its own contents in the tree and returns the
# removed tag, which is now empty.
a.unwrap()

<a href="https://www.youtube.com/itversityin"></a>

soup

<table>
<tbody>
<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

# Import from the public IPython.display module: IPython.core.display is an
# internal module and importing display from it is deprecated.
from IPython.display import display, HTML

# Render the modified document; the first link is now plain text.
display(HTML(str(soup)))

Details	URL
Video Content	YouTube Channel
Reference Material	GitHub Repository

Updating Tag Attribute

# Print every table row (<tr>) currently in the document.
for tag in soup.find_all('tr'):
    print(tag)

<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>

# Attributes are read with dict-style indexing on the tag. None of the <tr>
# tags has a 'class' attribute yet, so this lookup raises KeyError.
# (tag.get('class') returns None instead of raising.)
for tag in soup.find_all('tr'):
    print(tag['class'])

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-14-0007006650f9> in <module>
      1 for tag in soup.find_all('tr'):
----> 2     print(tag['class'])

/opt/anaconda3/envs/beakerx/lib/python3.6/site-packages/bs4/element.py in __getitem__(self, key)
   1404         """tag[key] returns the value of the 'key' attribute for the Tag,
   1405         and throws an exception if it's not there."""
-> 1406         return self.attrs[key]
   1407 
   1408     def __iter__(self):

KeyError: 'class'

# Assigning to a key adds the attribute to the tag (or overwrites it).
for tag in soup.find_all('tr'):
    tag['class'] = 'special'

# Every row now has a 'class' attribute, so the lookup succeeds.
for tag in soup.find_all('tr'):
    print(tag['class'])

special
special
special

soup

<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr class="special">
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

Wrapping Text

# new_tag() creates an empty <strong> tag; it is not part of the document
# until it is inserted somewhere.
strong = soup.new_tag('strong')

strong

<strong></strong>

type(strong)

bs4.element.Tag

td = soup.find('td')

td.text

'Video Content'

# Copy the cell's text into the new <strong> tag as its first child.
strong.insert(0, td.text)

strong

<strong>Video Content</strong>

# Empty the <td> so the text is not duplicated once <strong> is inserted.
td.string = ''

td

<td></td>

# Place the <strong> tag, which now holds the text, inside the emptied <td>.
td.insert(0, strong)

soup

<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td><strong>Video Content</strong></td>
<td>YouTube Channel
</td>
</tr>
<tr class="special">
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

# Wrap the text of every <td> that does not contain a link in <strong>.
for tag in soup.find_all('td'):
    # Leave cells that hold an <a> tag untouched so the link markup survives.
    if tag.find('a'):
        continue
    strong = soup.new_tag('strong')
    strong.insert(0, tag.get_text())
    # Empty the cell before inserting so the text is not duplicated.
    tag.string = ''
    tag.insert(0, strong)

soup

<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td><strong>Video Content</strong></td>
<td><strong>YouTube Channel
</strong></td>
</tr>
<tr class="special">
<td><strong>Reference Material</strong></td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

# Import from the public IPython.display module: IPython.core.display is an
# internal module and importing display from it is deprecated.
from IPython.display import HTML, display

# Render the final document with the wrapped cells shown in bold.
display(HTML(str(soup)))

Details	URL
Video Content	YouTube Channel
Reference Material	GitHub Repository

Parsing HTML using BeautifulSoup | Scraping Website Content