Manipulating HTML Content

Let us understand how to manipulate HTML content using the APIs provided by BeautifulSoup.

  • decompose - to remove the tag along with the content.

  • unwrap - to remove the tag while retaining its content.

  • We can also change the attributes of a tag by assigning values with dict-style access on the tag object (for example, tag['class'] = 'special').

  • We can also wrap existing content or tags in new tags.

# Sample markup used by the examples: a <p> paragraph followed by a table with
# a header row (Details, URL) and two data rows whose URL cells contain links.
html_str = """
<p>Some Text</p>
<table>
    <tbody>
        <tr>
            <th>Details</th>
            <th>URL</th>
        </tr>
        <tr>
            <td>Video Content</td>
            <td><a href="https://www.youtube.com/itversityin">YouTube Channel</a>
            </td>
        </tr>
        <tr>
            <td>Reference Material</td>
            <td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
            </td>
        </tr>
    </tbody>
</table>
"""
# Render the raw markup in the notebook so it can be compared with the parsed tree.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import HTML, display

display(HTML(html_str))

Some Text

Details URL
Video Content YouTube Channel
Reference Material GitHub Repository
from bs4 import BeautifulSoup

# Parse the sample markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_str, features='html.parser')
# Show the parsed tree with one tag or text node per line, indented by depth.
pretty_html = soup.prettify()
print(pretty_html)
<p>
 Some Text
</p>
<table>
 <tbody>
  <tr>
   <th>
    Details
   </th>
   <th>
    URL
   </th>
  </tr>
  <tr>
   <td>
    Video Content
   </td>
   <td>
    <a href="https://www.youtube.com/itversityin">
     YouTube Channel
    </a>
   </td>
  </tr>
  <tr>
   <td>
    Reference Material
   </td>
   <td>
    <a href="https://www.github.com/dgadiraju/itversity-books">
     GitHub Repository
    </a>
   </td>
  </tr>
 </tbody>
</table>

Using decompose

# Locate the first <p> tag and remove it from the tree together with its text.
p = soup.find('p')
p.decompose()
# Echo the document: the <p> element is gone and only the table remains.
soup
<table>
<tbody>
<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td><a href="https://www.youtube.com/itversityin">YouTube Channel</a>
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

Using unwrap

# Grab the first <a> tag in the document (the YouTube link) and echo it.
a = soup.find('a')
a
<a href="https://www.youtube.com/itversityin">YouTube Channel</a>
# Strip the <a> tag but leave its text in place; the call returns the
# now-empty tag that was removed from the tree.
a.unwrap()
<a href="https://www.youtube.com/itversityin"></a>
# Echo the document: the first link is now plain text inside its <td>.
soup
<table>
<tbody>
<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>
# Render the modified document; str(soup) serializes the current tree to markup.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import HTML, display

display(HTML(str(soup)))
Details URL
Video Content YouTube Channel
Reference Material GitHub Repository

Updating Tag Attribute

# Print every table row (<tr>) currently in the document.
for row in soup.find_all('tr'):
    print(row)
<tr>
<th>Details</th>
<th>URL</th>
</tr>
<tr>
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr>
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
# Indexing a tag like a dict reads one of its attributes. None of the rows has
# a `class` attribute yet, so this lookup raises KeyError (traceback below).
for tag in soup.find_all('tr'):
    print(tag['class'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-14-0007006650f9> in <module>
      1 for tag in soup.find_all('tr'):
----> 2     print(tag['class'])

/opt/anaconda3/envs/beakerx/lib/python3.6/site-packages/bs4/element.py in __getitem__(self, key)
   1404         """tag[key] returns the value of the 'key' attribute for the Tag,
   1405         and throws an exception if it's not there."""
-> 1406         return self.attrs[key]
   1407 
   1408     def __iter__(self):

KeyError: 'class'
# Assign a `class` attribute to every table row using dict-style item assignment.
for row in soup.find_all('tr'):
    row['class'] = 'special'
# Read the attribute back to confirm that it is now present on each row.
for row in soup.find_all('tr'):
    print(row['class'])
special
special
special
# Echo the document: every <tr> now carries class="special".
soup
<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td>Video Content</td>
<td>YouTube Channel
</td>
</tr>
<tr class="special">
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>

Wrapping Text

# Create a new, empty <strong> tag; it is not attached to the document yet.
strong = soup.new_tag('strong')
strong
<strong></strong>
# Confirm that new_tag() produced a regular bs4 Tag object.
type(strong)
bs4.element.Tag
# Pick the first <td> cell in the table and look at its text content.
td = soup.find('td')
td.text
'Video Content'
# Put the cell's text inside the new <strong> tag (position 0 = first child).
strong.insert(0, td.text)
strong
<strong>Video Content</strong>
# Replace the cell's content with an empty string so the <td> renders as empty.
td.string = ''
td
<td></td>
# Insert the populated <strong> tag as the cell's first child, then echo the
# document to show the text is now wrapped in <strong>.
td.insert(0, strong)
soup
<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td><strong>Video Content</strong></td>
<td>YouTube Channel
</td>
</tr>
<tr class="special">
<td>Reference Material</td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>
# Wrap the text of every <td> that does not contain a link in a <strong> tag.
for cell in soup.find_all('td'):
    # Cells that still hold an <a> tag are left untouched.
    if cell.find('a'):
        continue
    cell_text = cell.text
    bold = soup.new_tag('strong')
    bold.insert(0, cell_text)
    # Blank out the cell before inserting the wrapper so the text is not duplicated.
    cell.string = ''
    cell.insert(0, bold)
# Echo the document to show the wrapped cells.
soup
<table>
<tbody>
<tr class="special">
<th>Details</th>
<th>URL</th>
</tr>
<tr class="special">
<td><strong>Video Content</strong></td>
<td><strong>YouTube Channel
</strong></td>
</tr>
<tr class="special">
<td><strong>Reference Material</strong></td>
<td><a href="https://www.github.com/dgadiraju/itversity-books">GitHub Repository</a>
</td>
</tr>
</tbody>
</table>
# Render the final document; str(soup) serializes the current tree to markup.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import HTML, display

display(HTML(str(soup)))
Details URL
Video Content YouTube Channel
Reference Material GitHub Repository