BeautifulSoup and href elements
Date: February 16th 2018
Last updated: February 16th 2018
Mostly self-explanatory: I wanted network data describing the links between web pages, and this script gets it done.
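The core of it is BeautifulSoup's find_all('a', href=True), which returns only the anchor tags that actually carry an href attribute. A minimal sketch of just that part, using a made-up HTML fragment purely for illustration:

from bs4 import BeautifulSoup

fragment = '<a href="/about">About</a> <a name="no-href">Skip me</a>'
soup = BeautifulSoup(fragment, "lxml")
# href=True filters out anchors without an href attribute
print([a['href'] for a in soup.find_all('a', href=True)])  # ['/about']

The full script follows.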
"""
Collect links between web pages.

This module is run at the command line.

Flags
-----
-b (--base): string
    The url where the search starts.
-c (--criteria): string
    Text that a collected url must contain before it is
    added to the crawl queue.

Usage
-----
python netprint.py
    --base "https://www.engineering.unsw.edu.au/"
    --criteria "engineering"

Output
------
Prints summary stats only. Do something with the DataFrame
after creation.
"""
from optparse import OptionParser
from urllib.request import urlopen
import time

import pandas as pd
from bs4 import BeautifulSoup

blackList = [
    'http://www.facebook.com/unsw',
    'http://www.twitter.com/unsw',
    'https://itunes.apple.com/',
    'http://www.youtube.com/unsw',
    'http://www.pinterest.com/unsw',
    'http://www.instagram.com/unsw',
    '/unsw-website-feedback',
    'https://itunes.apple.com/au/institution/unsw-university-new-south/id413200225',
    'https://www.cloudemail.unsw.edu.au/',
    'http://www.handbook.unsw.edu.au',
    'http://my.unsw.edu.au',
    'https://moodle.telt.unsw.edu.au/',
    'https://unsw.sharepoint.com/sites/ENG',
    'https://www.facebook.com/UnswFacultyOfEngineering',
    'https://www.linkedin.com/',
    'https://plus.google.com/',
    'http://www.unsw.edu.au/gen/pad/privacy.html',
    'http://www.unsw.edu.au/gen/pad/copyright.html',
    'http://www.unsw.edu.au/accessibility',
    'http://www.engineersaustralia.org.au/',
    'http://www.topuniversities.com/qs-stars/',
    'https://go8.edu.au/'
]
def collectlinks(url):
"""
Visit a single url and collect links
Args
----
url: string
Represents a url path to search.
Usage
-----
collectlinks("https://www.unsw.edu.au/", "unsw")
"""
    my_url_list = []
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    # find all anchor tags that carry an href attribute
    for a in soup.find_all('a', href=True):
        # keep every href that is not blacklisted
        if a['href'] not in blackList:
            my_url_list.append(a['href'])
    return my_url_list
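# Note: collectlinks() returns href values exactly as they appear in the page,
# so relative paths such as '/unsw-website-feedback' are collected too.
# urlopen() cannot open those later on; the crawl loop below simply reports
# them as failed and moves on.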
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "-b",
        "--base",
        dest="base",
        help="Search start point"
    )
    parser.add_option(
        "-c",
        "--criteria",
        dest="criteria",
        help="Limit searches to paths containing this text"
    )
    (options, args) = parser.parse_args()

    toVisit = [options.base]
    visited = {}
    count = 0
    start = time.time()
    while len(toVisit) > 0:
        # take the next url off the front of the queue
        url = toVisit.pop(0)
        count = count + 1
        print(count, len(toVisit), url)
        try:
            # proceed if the url hasn't been visited yet,
            # is not in the blacklist,
            # is not a document,
            # and is not a calendar page
            if ((url not in visited) and
                    (url not in blackList) and
                    not url.strip().lower().endswith(
                        ('.pdf', '.docx', '.doc', '.txt', '.md')) and
                    ('calendar' not in url)):
                collectedUrls = collectlinks(url)
                visited[url] = collectedUrls
                for colurl in collectedUrls:
                    if options.criteria in colurl:
                        toVisit.append(colurl)
        except Exception:
            print("Failed to open {}".format(url))
            continue
    # run time
    end = time.time()
    seconds = end - start
    runTime = str(int(seconds // 60)) + " min " + str(int(seconds % 60)) + " sec"
    print("\nRun time: {}\n".format(runTime))
    # convert dict to a list of [source, target] url pairs
    results = []
    for key, value in visited.items():
        try:
            for url in value:
                urlpair = [key, url]
                results.append(urlpair)
        except Exception:
            print("Problem with {}".format(key))

    # summary stats only
    df = pd.DataFrame(results)
    print(len(df.columns))
    print(len(df))
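The script only prints summary stats, so here is a minimal sketch of what could come next if networkx is installed; the column names and the output filename are just placeholder choices of mine: label the url pairs, drop duplicate edges, save the edge list and build a graph.

import networkx as nx

# assumes df is the two-column DataFrame built at the end of the script
df.columns = ["source", "target"]          # name the url pairs
df = df.drop_duplicates()                  # keep one edge per link
df.to_csv("unsw_links.csv", index=False)   # save the edge list

G = nx.from_pandas_edgelist(df, source="source", target="target")
print(G.number_of_nodes(), G.number_of_edges())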