{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Manipulating HTML Content\n", "\n", "Let us understand how to manipulate HTML content using the APIs provided by BeautifulSoup.\n", "\n", "* `decompose` - removes the tag along with its content.\n", "* `unwrap` - removes the tag while retaining its content.\n", "* We can also change the attributes of a tag by assigning values with dict-style access (for example, `tag['class'] = 'special'`).\n", "* We can also wrap existing content or tags in new tags." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "remove-cell" ] }, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%HTML\n", "" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "html_str = \"\"\"\n", "

Some Text

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "

Some Text

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.core.display import HTML, display\n", "display(HTML(html_str))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "

\n", " Some Text\n", "

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", " Details\n", " \n", " URL\n", "
\n", " Video Content\n", " \n", " \n", " YouTube Channel\n", " \n", "
\n", " Reference Material\n", " \n", " \n", " GitHub Repository\n", " \n", "
\n", "\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "soup = BeautifulSoup(html_str, 'html.parser')\n", "print(soup.prettify())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using decompose" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "p = soup.find('p')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "p.decompose()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using unwrap" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "a = soup.find('a')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "YouTube Channel" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a.unwrap()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.core.display import display, HTML\n", "display(HTML(str(soup)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Updating Tag Attribute" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Details\n", "URL\n", "\n", "\n", "Video Content\n", "YouTube Channel\n", "\n", "\n", "\n", "Reference Material\n", "GitHub Repository\n", "\n", "\n" ] } ], "source": [ "for tag in soup.find_all('tr'):\n", " print(tag)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'class'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtag\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'tr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'class'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/opt/anaconda3/envs/beakerx/lib/python3.6/site-packages/bs4/element.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \"\"\"tag[key] returns the value of the 'key' attribute for the Tag,\n\u001b[1;32m 1405\u001b[0m and throws an exception if it's not there.\"\"\"\n\u001b[0;32m-> 1406\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1407\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1408\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'class'" ] } ], "source": [ "for tag in soup.find_all('tr'):\n", " print(tag['class'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "for tag in soup.find_all('tr'):\n", " tag['class'] = 'special'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "special\n", "special\n", "special\n" ] } ], "source": [ "for tag in soup.find_all('tr'):\n", " print(tag['class'])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Wrapping Text" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "strong = soup.new_tag('strong')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strong" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bs4.element.Tag" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(strong)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "td = soup.find('td')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Video Content'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "td.text" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "strong.insert(0, td.text)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Video Content" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strong" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "td.string = ''" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "td" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "td.insert(0, strong)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "for tag in soup.find_all('td'):\n", " if not tag.find('a'):\n", " strong = soup.new_tag('strong')\n", " strong.insert(0, tag.text)\n", " tag.string = ''\n", " tag.insert(0, strong)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
DetailsURL
Video ContentYouTube Channel\n", "
Reference MaterialGitHub Repository\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.core.display import HTML, display\n", "display(HTML(str(soup)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.12" } }, "nbformat": 4, "nbformat_minor": 4 }