Fetching Web Pages
>>> import urllib.request
>>> page = urllib.request.urlopen("https://www.boredapi.com/api/activity")
>>> page.read()
b'{"activity":"Learn the NATO phonetic alphabet","type":"education","participants":1,"price":0,"link":"https://en.wikipedia.org/wiki/NATO_phonetic_alphabet","key":"6706598","accessibility":0}'
page.read() returns the page source (as a bytes object).
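Since the body comes back as bytes, decoding it into a regular string is usually the next step. A minimal sketch, re-using the same URL as above and assuming the response is UTF-8 encoded:

>>> page = urllib.request.urlopen("https://www.boredapi.com/api/activity")
>>> raw = page.read()             # bytes
>>> text = raw.decode("utf-8")    # str, assuming the page is UTF-8 encoded
>>> type(raw), type(text)
(<class 'bytes'>, <class 'str'>)

We will rely on this decode step later when we run regular expressions over page content.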
To get the page URL, use:
>>> page.geturl()
'https://www.boredapi.com/api/activity'
To get the HTTP response code, use:
>>> page.getcode()
200
We haven't covered exceptions yet, so if you would like to check whether a page exists, you can use the following function (an example of its usage is in the midterm program prep):
import urllib.request

def page_exists(page):
    try:
        urllib.request.urlopen(page)
        return True
    except:
        # any failure (bad URL, HTTP error, network error) counts as "does not exist"
        return False
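As a quick sanity check, you could call it on the API used above and on a deliberately bogus host (the .invalid hostname below is just a made-up example that can never resolve):

print(page_exists("https://www.boredapi.com/api/activity"))  # True, if the API is reachable
print(page_exists("http://no-such-host.invalid/"))           # False: the hostname cannot resolve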
BUILDING A WEB CRAWLER
Let's write a crawler that crawls the UNH web site.
We will start by hitting the home page and looking at its content:
import urllib.request

seed = "http://www.newhaven.edu/"
page = urllib.request.urlopen(seed)
content = page.read()
code = page.getcode()
print(code, " : ", page.geturl(), ":", content)
The response we get back is quite lengthy, so if you want to store it in a file you can do:
python crawler.py >> crawler_out.txt
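Alternatively, you can save the page from inside the script itself. A minimal sketch, assuming the content variable from the script above is still in scope (the crawler_out.txt file name is just an example):

# content is the bytes object returned by page.read(), so open the file in binary mode
with open("crawler_out.txt", "wb") as out_file:
    out_file.write(content)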
At this time we will only be interested in the following data:
<title>Home - University of New Haven</title>
<meta name="Keywords" content="University of New Haven" />
So let's try to extract that data using regular expressions (Python's re module).
Extracting the title (note that page.read() returns bytes, so we need to decode the content into a string):
import urllib.request
import re

seed = "http://www.newhaven.edu/"
page = urllib.request.urlopen(seed)
content = page.read()
content_string = content.decode("utf-8")

# pass re.IGNORECASE to compile(); search() does not take a flags argument
regexp_title = re.compile('<title>(?P<title>(.*))</title>', re.IGNORECASE)

result = regexp_title.search(content_string)
if result:
    title = result.group("title")
    print(title)
$ python crawler.py
Home - University of New Haven
Now let's extract both the title and the keywords:
import urllib.request
import re

seed = "http://www.newhaven.edu/"
page = urllib.request.urlopen(seed)
content = page.read()
content_string = content.decode("utf-8")

regexp_title = re.compile('<title>(?P<title>(.*))</title>', re.IGNORECASE)
regexp_keywords = re.compile('<meta name="Keywords" content="(?P<keywords>(.*))" />', re.IGNORECASE)

result = regexp_title.search(content_string)
if result:
    title = result.group("title")
    print(title)

result = regexp_keywords.search(content_string)
if result:
    keywords = result.group("keywords")
    print(keywords)
$ python crawler.py
Home - University of New Haven
University of New Haven
Now, let's build a simple crawler that will crawl the UNH web pages extracting the title and meta keywords to make them searchable later on.
We first need to extract all URLs on the seed page:
import urllib.request
import re

seed = "http://www.newhaven.edu/"
page = urllib.request.urlopen(seed)
content = page.read()
content_string = content.decode("utf-8")

regexp_title = re.compile('<title>(?P<title>(.*))</title>', re.IGNORECASE)
regexp_keywords = re.compile('<meta name="[kK]eywords" content="(?P<keywords>(.*))" />')
regexp_url = re.compile(r"www\.newhaven\.edu[/\w+]*")

result = regexp_title.search(content_string)
if result:
    title = result.group("title")
    print(title)

result = regexp_keywords.search(content_string)
if result:
    keywords = result.group("keywords")
    print(keywords)

for urls in re.findall(regexp_url, content_string):
    print(urls)
$ python crawler.py
Home - University of New Haven
University of New Haven
www.newhaven.edu/_resources/images/hero/maxcy
www.newhaven.edu/_resources/images/hero/maxcy
www.newhaven.edu/_resources/images/favicons/favicon
www.newhaven.edu/about/facilities/index
www.newhaven.edu/_resources/images/centennial/cropped
Now we need to visit all the URLs and extract the title and meta tags from each of them.
Keep in mind that many of these pages will contain the same nested URLs, and we don't want to revisit them.
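The bookkeeping idea is simple: keep a dictionary of every URL we have seen, and only process a URL the first time we meet it. A minimal sketch of just that check (should_visit is a hypothetical helper name, not part of the crawler below):

crawler_backlog = {}   # url -> 1 once processed, 0 if only discovered

def should_visit(url):
    # process a URL only if it has not been marked as visited yet
    if url in crawler_backlog and crawler_backlog[url] == 1:
        return False
    crawler_backlog[url] = 1
    return True

The full crawler below folds this check into its visit_url() function.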
import urllib.request
from urllib.error import URLError
import re

def visit_url(url, domain):
    global crawler_backlog
    # stop once the backlog has grown past 100 URLs
    if len(crawler_backlog) > 100:
        return
    # skip URLs we have already processed
    if url in crawler_backlog and crawler_backlog[url] == 1:
        return
    else:
        crawler_backlog[url] = 1
    print("Processing:", url)
    try:
        page = urllib.request.urlopen(url)
        code = page.getcode()
        if code == 200:
            content = page.read()
            content_string = content.decode("utf-8")
            # pass re.IGNORECASE to compile() instead of search()
            regexp_title = re.compile('<title>(?P<title>(.*))</title>', re.IGNORECASE)
            regexp_keywords = re.compile('<meta name="keywords" content="(?P<keywords>(.*))" />', re.IGNORECASE)
            regexp_url = re.compile(r"https?://\w*" + domain + r"[/\w+]*")
            result = regexp_title.search(content_string)
            if result:
                title = result.group("title")
                print(title)
            result = regexp_keywords.search(content_string)
            if result:
                keywords = result.group("keywords")
                print(keywords)
            # visit every in-domain URL we have not processed yet
            for urls in re.findall(regexp_url, content_string):
                if urls not in crawler_backlog or crawler_backlog[urls] != 1:
                    crawler_backlog[urls] = 0
                    visit_url(urls, domain)
    except URLError:
        print("error")

crawler_backlog = {}
seed = "http://www.newhaven.edu/"
crawler_backlog[seed] = 0
visit_url(seed, r"\.newhaven\.edu")
BUILDING A LOAD TEST
What is a load test?
(discussion)
We will be building a load test to test GET services for a list of URLs.
We should be able to define the number of concurrent users hitting the services.
https://cat-fact.herokuapp.com/facts
Let's start:
First, we need to be able to access the URL, so let's write code that does just that:
import urllib.request

# this is the function to run a single test
def one_test(url):
    page = urllib.request.urlopen(url)
    res = page.read()
    print(res)  # we can delete this print statement later

one_test('https://cat-fact.herokuapp.com/facts')
$ python load_test.py
b'{"all":[{"_id":"58e0088b0aac31001185ed09","text":"The world\'s largest cat measured 48.5 inches long.","type":"cat","user":{"_id":"58e007480aac31001185ecef","name":{"first":"Kasimir","last":"Schulz"}},"upvotes":7,"userUpvoted":null},
Now add a timer to it:
import urllib.request
import time

# this is the function to run a single test
def one_test(url):
    start_time = time.monotonic() * 1000   # time.monotonic() is in seconds; *1000 gives milliseconds
    page = urllib.request.urlopen(url)
    page.read()
    end_time = time.monotonic() * 1000
    return end_time - start_time

print(one_test('https://cat-fact.herokuapp.com/facts'))
$ python load_test.py
416.97022300000003
We will now move the URL into a list and add the other URLs:
import urllib.request
import time

url_list = [
    'https://cat-fact.herokuapp.com/facts',
    'https://cat-fact.herokuapp.com/facts/random',
    'https://cat-fact.herokuapp.com/facts?animal_type=cat,horse']

# this is the function to run a single test
def one_test(url):
    start_time = time.monotonic() * 1000
    page = urllib.request.urlopen(url)
    page.read()
    end_time = time.monotonic() * 1000
    return end_time - start_time

for url in url_list:
    print(one_test(url))
$ python load_test.py
395.063674
181.19762100000008
366.08377800000005
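If you want a single summary number instead of the raw timings, a small variation could collect them in a list and report the average. This is just a sketch built on the url_list and one_test() defined above, not part of the original script:

timings = []
for url in url_list:
    timings.append(one_test(url))

# report each timing and the overall average, in milliseconds
for url, ms in zip(url_list, timings):
    print(url, ":", round(ms, 2), "ms")
print("average:", round(sum(timings) / len(timings), 2), "ms")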
Adding concurrency
import urllib.request
import time
import threading

url_list = [
    'https://cat-fact.herokuapp.com/facts',
    'https://cat-fact.herokuapp.com/facts/random',
    'https://cat-fact.herokuapp.com/facts?animal_type=cat,horse']

# this is the function to run a single test
def one_test(url):
    start_time = time.monotonic() * 1000
    page = urllib.request.urlopen(url)
    page.read()
    end_time = time.monotonic() * 1000
    return end_time - start_time

# one simulated user runs through the whole URL list
def one_user():
    for url in url_list:
        print(one_test(url))

# start 3 concurrent users, each in its own thread
for k in range(3):
    t = threading.Thread(target=one_user)
    t.start()
$ python load_test.py
583.726101
671.282777
684.9828620000001
189.13146899999992
165.072446
192.9520010000001
385.1676630000002
473.7457770000001
532.925066
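Note that the main program does not wait for the user threads; it starts them and falls off the end while they are still running. If you want the script to block until every simulated user is done, a sketch using Thread.join() (keeping the thread objects in a list is an addition, not part of the script above) would be:

threads = []
for k in range(3):
    t = threading.Thread(target=one_user)
    t.start()
    threads.append(t)

# wait for every simulated user thread to finish
for t in threads:
    t.join()
print("all users finished")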
And finally, adding think time:
import urllib.request
import time
import threading

url_list = [
    'https://cat-fact.herokuapp.com/facts',
    'https://cat-fact.herokuapp.com/facts/random',
    'https://cat-fact.herokuapp.com/facts?animal_type=cat,horse']

# this is the function to run a single test
def one_test(url):
    start_time = time.monotonic() * 1000
    page = urllib.request.urlopen(url)
    page.read()
    end_time = time.monotonic() * 1000
    return end_time - start_time

def one_user():
    for url in url_list:
        print(one_test(url))
        time.sleep(1)  # number of seconds for the execution to suspend
        print(threading.current_thread(), " is sleeping")

for k in range(3):
    t = threading.Thread(target=one_user)
    t.start()
$ python load_test.py
585.4337079999999
761.8894909999999
764.2173230000001
<Thread(Thread-1, started 123145447641088)> is sleeping
<Thread(Thread-2, started 123145452896256)> is sleeping
<Thread(Thread-3, started 123145458151424)> is sleeping
511.39206000000036
640.5732069999999
642.4653329999994
<Thread(Thread-1, started 123145447641088)> is sleeping
<Thread(Thread-2, started 123145452896256)> is sleeping
<Thread(Thread-3, started 123145458151424)> is sleeping
682.5636030000001
746.524218
764.053617
<Thread(Thread-1, started 123145447641088)> is sleeping
<Thread(Thread-3, started 123145458151424)> is sleeping
<Thread(Thread-2, started 123145452896256)> is sleeping