Extracting Content from a Website

Let us read the content from the website and get the length of that content using BeautifulSoup and pure Python collections.

  • First, we will explore how to read the main content of a single page.

  • We will then create a list of all the URLs from which we want to extract the content.

import requests

page_url = 'https://python.itversity.com/04_postgres_database_operations/04_ddl_data_definition_language.html'
page = requests.get(page_url)
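  • Before parsing, we can optionally confirm that the request succeeded. This is an illustrative check rather than part of the original flow; a status code of 200 means the page was fetched successfully.

page.status_code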

from bs4 import BeautifulSoup

soup = BeautifulSoup(page.content, 'html.parser')
  • Calling get_text on the soup object will return the text of the entire page. However, we are only interested in the main content at the center.

soup.get_text()
  • The main content at the center is under a div tag with id main-content. We can search for that tag and use get_text to extract the main content from a single page.

soup.find('div', id='main-content').get_text()
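  • As a quick sanity check, we can compare the length of the text of the entire page with the length of the main content. This is a minimal sketch; full_text and main_text are illustrative variable names, and the lengths depend on the page at the time of scraping.

full_text = soup.get_text()
main_text = soup.find('div', id='main-content').get_text()

len(full_text), len(main_text)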
  • Now let us get the text content for all the pages.

    • Get all the URLs that need to be scraped into a list.

    • For each URL, extract the content and add it to a list along with the URL.

  • The new list should contain the URL as well as the content for each page.

import requests

python_base_url = 'https://python.itversity.com'
python_url = f'{python_base_url}/mastering-python.html'
python_page = requests.get(python_url)

from bs4 import BeautifulSoup

soup = BeautifulSoup(python_page.content, 'html.parser')
nav = soup.find('nav', {'id': 'bd-docs-nav'})
first_level_urls = []
# Collect the first level urls from the navigation, skipping placeholder anchors
for a in nav.find_all('a', {'class': 'reference internal'}):
    if a['href'] != '#':
        first_level_urls.append(a['href'])
all_urls = []
for first_level_url in first_level_urls:
    url = f"{python_base_url}/{first_level_url}"
    all_urls.append(url)
    # Fetch each first level page and locate the active section in its navigation
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    current_nav = soup.find('nav', {'id': 'bd-docs-nav'})
    current_href = current_nav.find('li', {'class': 'toctree-l1 current active'})
    # Second level hrefs are relative, so resolve them against the first level url
    for second_level_href in current_href.find_all('a', {'class': 'reference internal'}):
        all_urls.append(f"{'/'.join(url.split('/')[:-1])}/{second_level_href['href']}")
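  • Before scraping every page, it can help to confirm how many URLs were collected and preview a few of them. This is an optional, illustrative check.

len(all_urls)

all_urls[:5]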
%%time
url_and_content_list = []
# Scrape the main content of every collected page and pair it with its url
for content_url in all_urls:
    content_page = requests.get(content_url)
    content_soup = BeautifulSoup(content_page.content, 'html.parser')
    content_text = content_soup.find('div', id='main-content').get_text()
    url_and_content_list.append((content_url, content_text))
for url, content in url_and_content_list[:10]:
    print(f'{url} : {len(content)}')
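  • If we prefer the content keyed by URL for convenient lookups, the list of tuples can be converted into a dict using pure Python collections. This is an optional sketch; url_and_content_dict is an illustrative name.

url_and_content_dict = dict(url_and_content_list)

# Total number of characters scraped across all the pages
sum(len(content) for content in url_and_content_dict.values())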