BeautifulSoup and href elements

Date: February 16th 2018
Last updated: February 16th 2018

Mostly self-explanatory. I wanted network data capturing the links between web pages, and this script gets it done.
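The script relies on BeautifulSoup with the lxml parser plus pandas, so (assuming a standard Python 3 setup) the dependencies can be installed with something like:

    pip install beautifulsoup4 lxml pandas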

"""
Collect links between webpages.
This module is run at the command line.

args
----

    base: string
        Represents a url to initiate the search.

    criteria: string
        Represents a string that a collected url
        must contain to be added to the search.

flags
-----

    -b (--base): string
        Represents a url to initiate the search.

    -c (--criteria): string
        Represents a string that a collected url
        must contain to be added to the search.

Usage
------

    python netprint.py \
        --base "https://www.engineering.unsw.edu.au/" \
        --criteria "engineering"


Output
------

    Prints summary statistics only. Extend the script to make
    use of the DataFrame after it is created (a sketch of one
    option follows the script).

"""

blackList = [
    'http://www.facebook.com/unsw',
    'http://www.twitter.com/unsw',
    'https://itunes.apple.com/',
    'http://www.youtube.com/unsw',
    'http://www.pinterest.com/unsw',
    'http://www.instagram.com/unsw',
    '/unsw-website-feedback',
    'https://itunes.apple.com/au/institution/unsw-university-new-south/id413200225',
    'https://www.cloudemail.unsw.edu.au/',
    'http://www.handbook.unsw.edu.au',
    'http://my.unsw.edu.au',
    'https://moodle.telt.unsw.edu.au/',
    'https://unsw.sharepoint.com/sites/ENG',
    'https://www.facebook.com/UnswFacultyOfEngineering',
    'https://www.linkedin.com/',
    'https://plus.google.com/',
    'http://www.unsw.edu.au/gen/pad/privacy.html',
    'http://www.unsw.edu.au/gen/pad/copyright.html',
    'http://www.unsw.edu.au/accessibility',
    'http://www.engineersaustralia.org.au/',
    'http://www.topuniversities.com/qs-stars/',
    'https://go8.edu.au/'
]
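One caveat: the blacklist is compared against hrefs by exact string matching further down, so near-variants (trailing slashes, deeper paths under a blacklisted site) slip through. If that matters, a prefix check could be swapped in. A minimal sketch, with is_blacklisted being my own helper name rather than something in the original script:

def is_blacklisted(href):
    # treat any href that starts with a blacklisted url as blacklisted
    return any(href.startswith(bad) for bad in blackList)

The membership tests further down would then become not is_blacklisted(...) instead of ... not in blackList.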


def collectlinks(url):
    """
    Visit a single url and collect links

    Args
    ----
    url: string
        Represents a url path to search.

    Usage
    -----
    collectlinks("https://www.unsw.edu.au/", "unsw")
    """
    my_url_list = []
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    # find all anchor tags
    for a in soup.find_all('a', href=True):
        # collect all urls
        if (a['href'] not in blackList):
            my_url_list.append(a['href'])
    return my_url_list
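One thing to watch: href values are returned exactly as they appear in the page, so relative paths such as /unsw-website-feedback come back unresolved, and urlopen will fail on them later (the try/except in the main loop just skips them). If those links should be followed too, they could be resolved against the page url with urllib.parse.urljoin. A sketch of a variant of my own, not the function the script below actually uses; it assumes the same urlopen and BeautifulSoup imports as the rest of the script:

from urllib.parse import urljoin

def collectlinks_absolute(url):
    """
    Variant of collectlinks that resolves relative hrefs
    against the page url before returning them.
    """
    my_url_list = []
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])
        if absolute not in blackList:
            my_url_list.append(absolute)
    return my_url_list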


if __name__ == "__main__":
    from optparse import OptionParser
    from bs4 import BeautifulSoup
    import pandas as pd
    from urllib.request import urlopen
    import time
    parser = OptionParser()
    parser.add_option(
        "-b",
        "--base",
        dest="base",
        help="Search start point"
        )
    parser.add_option(
        "-c",
        "--criteria",
        dest="criteria",
        help="Limit searches to paths containing this text"
        )
    (options, args) = parser.parse_args()
    toVisit = [options.base]
    visited = {}
    count = 0
    start = time.time()
    while len(toVisit) > 0:
        # take the next url from the front of the queue so the list
        # is not modified while it is being iterated over
        url = toVisit.pop(0)
        count = count + 1
        print(count, len(toVisit), url)
        try:
            # proceed if the...
            # url hasn't been visited yet
            # and is not a document type
            # and is not in the blacklist
            # and is not a calendar page
            if ((url not in visited) and
                    (url not in blackList) and
                    (not url.strip().lower().endswith(
                        ('.pdf', '.docx', '.doc', '.txt', '.md'))) and
                    ('calendar' not in url)):
                collectedUrls = collectlinks(url)
                visited[url] = collectedUrls
                for colurl in collectedUrls:
                    if (options.criteria in colurl):
                        toVisit.append(colurl)
        except Exception:
            print("Failed to open {}".format(url))

    # run time
    end = time.time()
    seconds = end - start
    runTime = str(int(seconds // 60)) + " min " + str(int(seconds % 60)) + " sec"
    print("\nRun time: {}\n".format(runTime))

    # convert dict to list of lists containing URL pairs
    results = []
    for key, value in visited.items():
        try:
            for url in value:
                urlpair = [key, url]
                results.append(urlpair)
        except Exception:
            print("Problem with {}".format(key))

    # print results
    df = pd.DataFrame(results)
    print(len(df.columns))  # two columns: source page, collected link
    print(len(df))          # one row per link found
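As the docstring says, the script only prints summary statistics. One option, assuming the lines below run straight after the DataFrame is created inside the main block, is to name the two columns and write the edge list out, or load it into networkx (an extra dependency) for the actual network analysis:

    # name the source/target columns and save the edge list
    df.columns = ["source", "target"]
    df.to_csv("links.csv", index=False)

    # optional: build a directed graph for analysis (requires networkx)
    import networkx as nx
    G = nx.from_pandas_edgelist(df, source="source", target="target",
                                create_using=nx.DiGraph())
    print(G.number_of_nodes(), G.number_of_edges())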

results matching ""

    No results matching ""