Extracting Content from a Website¶
Let us read the content from the website and get the length of the content using BeautifulSoup and pure Python collections.
First, we will explore how to read the main content of a single page.
We will then build a list of all the URLs from which we want to extract the content.
import requests
page_url = 'https://python.itversity.com/04_postgres_database_operations/04_ddl_data_definition_language.html'
page = requests.get(page_url)
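Before parsing the response, it is worth checking that the request actually succeeded. A minimal check, relying on requests' raise_for_status, which raises an exception for 4xx/5xx responses:
page.raise_for_status()  # raises requests.HTTPError if the status code is 4xx or 5xx
print(page.status_code)  # 200 on success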
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
This will read the text of the entire page. However, we are only interested in the main content at the center.
soup.get_text()
The main content at the center is under a div tag with id main-content. We can find that tag and use get_text to extract the main content from a single page.
soup.find('div', id='main-content').get_text()
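To see how much of the page the navigation and sidebars account for, we can compare the length of the full-page text with the length of the main content. A small sketch (the actual numbers depend on the page at the time of the request):
full_text = soup.get_text()
main_text = soup.find('div', id='main-content').get_text()
print(len(full_text), len(main_text))  # main_text should be noticeably shorter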
Now let us get the text content for all the pages.
- Get all the URLs that need to be scraped into a list.
- For each URL, extract the content and add it to a new list along with the URL, so that each entry holds both the URL and its content.
import requests
from bs4 import BeautifulSoup

python_base_url = 'https://python.itversity.com'
python_url = f'{python_base_url}/mastering-python.html'
python_page = requests.get(python_url)
soup = BeautifulSoup(python_page.content, 'html.parser')
nav = soup.find('nav', {'id': 'bd-docs-nav'})

# Collect the first-level URLs from the navigation sidebar
first_level_urls = []
for a in nav.find_all('a', {'class': 'reference internal'}):
    if a['href'] != '#':
        first_level_urls.append(a['href'])

# Visit each first-level page and also collect its second-level URLs
all_urls = []
for first_level_url in first_level_urls:
    url = f"{python_base_url}/{first_level_url}"
    all_urls.append(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    current_nav = soup.find('nav', {'id': 'bd-docs-nav'})
    current_href = current_nav.find('li', {'class': 'toctree-l1 current active'})
    for second_level_href in current_href.find_all('a', {'class': 'reference internal'}):
        # Second-level hrefs are relative to the first-level page's directory
        all_urls.append(f"{'/'.join(url.split('/')[:-1])}/{second_level_href['href']}")
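Before extracting the content, it is a good idea to sanity-check the list we just built. A quick inspection (the exact count depends on the site's table of contents at the time of scraping):
print(len(all_urls))
for u in all_urls[:5]:
    print(u)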
%%time
# Extract the main content of every page along with its URL
url_and_content_list = []
for content_url in all_urls:
    content_page = requests.get(content_url)
    content_soup = BeautifulSoup(content_page.content, 'html.parser')
    content_text = content_soup.find('div', id='main-content').get_text()
    url_and_content_list.append((content_url, content_text))
for url in url_and_content_list[:10]:
    print(f'{url[0]} : {len(url[1])}')
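Since all of these pages live on the same host, reusing a single HTTP connection can reduce the total time reported by %%time. A minimal variation of the extraction loop above, using requests.Session for connection reuse (the logic is otherwise unchanged):
import requests
from bs4 import BeautifulSoup

url_and_content_list = []
with requests.Session() as session:  # reuses the underlying TCP connection across requests
    for content_url in all_urls:
        content_page = session.get(content_url)
        content_soup = BeautifulSoup(content_page.content, 'html.parser')
        content_text = content_soup.find('div', id='main-content').get_text()
        url_and_content_list.append((content_url, content_text))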