Web scraping: Authorization

Login and password

Web scraping with login and password

Scraping data from web pages that require a login and password presents an additional challenge: staying authorized throughout the process. This is where requests.Session() is extremely handy.
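
A minimal sketch of the idea (the URL and form fields below are placeholders, not the actual site used in this project):

import requests

session = requests.Session()
# the session keeps any cookies the server sets,
# so the login survives across the following requests
session.post('https://example.com/login', data={'username': 'foo', 'password': 'bar'})
response = session.get('https://example.com/members-only-prices')
print(response.status_code)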

The objective of this project was to collect data from a web site with a huge price list of health products. While some information is available without authorization, some important data is revealed only if you stay connected after logging in.

Python's requests does all the heavy lifting of connecting, logging in and keeping the session alive, while the BeautifulSoup library helps to collect the data needed for navigation and scraping.
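
For the parsing side, BeautifulSoup turns the returned HTML into a searchable tree. A generic example (the markup here is made up, not taken from the project site):

from bs4 import BeautifulSoup

html = '<ul><li><a href="/cat/braces">Braces</a></li><li><a href="/cat/tape">Tape</a></li></ul>'
soup = BeautifulSoup(html, 'html.parser')
# collect link texts and targets for navigation
links = [(a.text, a['href']) for a in soup.find_all('a')]
print(links)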

Scraping data



Python's requests

The first step before logging in is to get the token value from a hidden input of the login form. Then you need to provide headers and cookies for requests.Session(), and do not forget to pass the login and password in the request data.

Once connected, you collect navigation data from the site and then follow the links to scrape the data you need from every page.

# import libraries
import csv
import requests
import time
from bs4 import BeautifulSoup

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Host': 'www.fab-ent.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15',
    'Accept-Language': 'en-gb',
    'Referer': 'https://www.fab-ent.com/',
    'Connection': 'keep-alive',
}

login_url = 'https://www.fab-ent.com/manage-account/'

# get token
response = requests.get(login_url, headers=headers)
print('connecting to login page to get token..')
time.sleep(10)
soup = BeautifulSoup(response.content, 'html.parser')
try:
  token = soup.find('input', {'id':'_wpnonce'})['value']
  print(f'found token: {token}')
except (TypeError, KeyError):
  print('token not found.. exit')
  exit()

# login

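# these cookie values come from a logged-in browser session (e.g. copied via DevTools);
# they are session-specific and will need to be refreshed before a new run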
cookies = {
    'PHPSESSID': '7pq0ohu5v4gvgf38p9umfirvot',
    '_ga': 'GA1.2.923748210.1611468192',
    '_gid': 'GA1.2.1435257517.1611468192',
    '_fbp': 'fb.1.1611468192145.947029952',
    '_mkto_trk': 'id:672-BAK-352&token:_mch-fab-ent.com-1611468193546-81679',
    'wordpress_test_cookie': 'WP+Cookie+check',
    '_gat': '1',
}

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://www.fab-ent.com',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.fab-ent.com/manage-account/',
    'Accept-Language': 'en-US,en;q=0.9',
}

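# 'foo' and 'bar' below are placeholders -- replace them with real account credentials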
data = {
  'username': 'foo',
  'password': 'bar',
  '_wpnonce': token,
  '_wp_http_referer': '/manage-account/',
  'login': 'Login'
}

# start the session
s = requests.Session()

# post to page to login
s.post('https://www.fab-ent.com/manage-account/', headers=headers, cookies=cookies, data=data)
print('trying to login..')
time.sleep(10)

# check if login was successful
print('check login success..')
response = s.get('https://www.fab-ent.com/manage-account/', headers=headers)
time.sleep(10)
soup = BeautifulSoup(response.content, 'html.parser')
print(f"logged-in as {soup.find_all('a', {'class':'aboveHeaderLink'})[1].text}")

# collect categories
side_bar = soup.find('aside', {'id' : 'catSidebar'})
categories = [link.text for link in side_bar.find_all('a')]
categories_links = [link['href'] for link in side_bar.find_all('a')]
print(f'found {len(categories)} categories:')
print(categories)

base_url = 'https://www.fab-ent.com'

# prepare list to store results from all product pages
results = []

# collect subcategories
for cat_num in range(len(categories)):
  url = base_url + categories_links[cat_num]
  print(f'collecting subcategories from {url}..')
  response = s.get(url, headers=headers)
  time.sleep(10)
  soup = BeautifulSoup(response.content, 'html.parser')
  side_bar = soup.find('aside', {'id' : 'catSidebar'})
  # collect subcategories
  sub_categories = [link.text for link in side_bar.find('a', {'class':'more'}).next_sibling.find_all('a')]
  sub_categories_links = [link['href'] for link in side_bar.find('a', {'class':'more'}).next_sibling.find_all('a')]
  print(f'found {len(sub_categories)} subcategories:')
  print(sub_categories)
  
  # collect list of products from subcategory
  for sub_cat_num in range(len(sub_categories)):
    url = base_url + sub_categories_links[sub_cat_num]
    print(f'collecting list of products from {url}..')
    response = s.get(url, headers=headers)
    time.sleep(10)
    soup = BeautifulSoup(response.content, 'html.parser')
    cards = soup.find('ul', {'class':'products'})
    products = [product.find('a').text for product in cards.find_all('h3')]
    products_links = [link.find('a')['href'] for link in cards.find_all('h3')]
    print(f'found {len(products)} products:')
    print(products)
    
    # collect SKUs(items) from product page
    for prod_num in range(len(products)):
      url = base_url + products_links[prod_num]
      print(f'collecting SKUs (items) from {url}..')
      response = s.get(url, headers=headers)
      time.sleep(10)
      soup = BeautifulSoup(response.content, 'html.parser')
      
      # collect table rows from page
      table = soup.find('table').find_all('tr')
      for row in range(len(table)):
        # check if row is a header
        if 'th' in [ tag.name for tag in table[row] ]:
          continue
        if len([tag.name for tag in table[row]]) > 3:
          # prepare dict for item    
          item = {}
          # collect data from cells in each row
          item_row = table[row].find_all('td')
          item['item_id'] = item_row[0].text
          item['item_desc'] = item_row[1].text          
          item['item_price'] = item_row[2].text
          # check if price2 exists (not exists if not logged-in)
          try:
            item['item_price2'] = item_row[3].text
          except IndexError:
            item['item_price2'] = ''
          
        if len([tag.name for tag in table[row]]) < 4:
          item['dimensions'] = ''
          item['weight'] = ''
          item['upc'] = ''
          for tag in table[row].find_all('strong'):
            if tag.text == 'Dimensions:':
              item['dimensions'] = tag.next_sibling
            if tag.text == 'Weight:':
              item['weight'] = tag.next_sibling
            if tag.text == 'UPC:':
              item['upc'] = tag.next_sibling
              
          # add the completed item to the results list (once per detail row)
          results.append(item)

# save collected data to a CSV file
with open('product_test.csv', 'w', newline='') as file:
  writer = csv.DictWriter(file, results[0].keys())
  writer.writeheader()
  writer.writerows(results)



The final step is to clean the data with pandas and export it to CSV format.

Data cleaning and export to CSV
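
A short sketch of what that cleanup could look like (the column names follow the keys used in the script above; the exact cleaning steps and the price format are assumptions):

import pandas as pd

df = pd.read_csv('product_test.csv')

# strip stray whitespace from the text columns
for col in ['item_id', 'item_desc', 'dimensions', 'weight', 'upc']:
    df[col] = df[col].astype(str).str.strip()

# convert price strings such as '$12.99' into numbers
for col in ['item_price', 'item_price2']:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace('$', '', regex=False), errors='coerce')

# drop duplicated SKUs and export the clean table
df = df.drop_duplicates(subset='item_id')
df.to_csv('products_clean.csv', index=False)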

Thank you for viewing my project. If you have any comments or suggestions, please contact me at the email address below.