-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathParseCoursePage.py
114 lines (96 loc) · 5.52 KB
/
ParseCoursePage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#===============================================================================
# Name: Parse Course Page
# Purpose: To parse websites that use AKARI Curriculum Management version 4.0.29 to show course and module content,
# and extract those details into files, as it is not possible to do so directly in RapidMiner 5.
# Author: Michael O'Brien
# Requirements:
# Python 2.7.10 including pip package to install the BeautifulSoup4 library
# Libraries
# BeautifulSoup4 used to extract the content from the html
# SoupStrainer used to cut down the amount of parsing done on the module pages
# urllib2 used to request the URLs
# Created: Nov 2015
#===============================================================================
# Import the beautiful soup library
from bs4 import BeautifulSoup
# Import the soup strainer
from bs4 import SoupStrainer
#To help deal with text in different formats
from bs4 import UnicodeDammit
# import urllib2 library to actually go get the webpage for Beautiful Soup
import urllib2
import sys
# Definitions
# function to help extract the relevant content from nested tables using the table header value then strip and clean the text
def parse_table_by_header_name(Source_table, header):
    """Return the cleaned text of the table whose ``<th>`` text equals *header*.

    The first ``th`` element in *Source_table* whose text matches *header* is
    located, its enclosing ``table`` is looked up, and that table's text is
    returned with the first occurrence of *header* removed and surrounding
    whitespace stripped.

    Args:
        Source_table: Object exposing a BeautifulSoup-style ``find`` method
            (for example a parsed tag) in which to search.
        header: Exact header text to look for.

    Returns:
        The cleaned table text, or the string
        ``"PARSING TABLE ERROR" + header`` when any step of the lookup raises
        (the exception details are printed first).
    """
    try:
        header_cell = Source_table.find("th", text=header)
        # If no header cell is found (bs4 documents find() as returning None
        # in that case) the attribute access below raises and is reported by
        # the handler.
        table_text = header_cell.find_parent("table").get_text()
        # Remove only the first occurrence of the header text so the same
        # words appearing later in the table body are preserved.
        return table_text.replace(header, "", 1).strip()
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print(sys.exc_info())
        return "PARSING TABLE ERROR" + header
# function to help extract the relevant content from nested tables using cssClass Value and some unique text value then strip and clean the text
def parse_table_by_Contents_and_cssClass(Source_table, FindThisText, cssClass):
    """Return the stripped text of the first matching table.

    Every ``table`` in *Source_table* carrying the CSS class *cssClass* is
    examined in order; the first one whose text contains *FindThisText* wins.

    Args:
        Source_table: Object exposing a BeautifulSoup-style ``find_all``
            method in which to search.
        FindThisText: Text that must appear somewhere in the table's text.
        cssClass: CSS class name used to select candidate tables.

    Returns:
        The whitespace-stripped text of the first matching table, ``None``
        when no candidate table contains *FindThisText*, or an error string
        beginning with ``"PARSING ERROR TABLE by Contents "`` when the search
        itself raises (the exception details are printed first).
    """
    try:
        for table in Source_table.find_all("table", class_=cssClass):
            raw_text = table.get_text()
            if FindThisText in raw_text:
                return raw_text.strip()
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print(sys.exc_info())
        return "PARSING ERROR TABLE by Contents " + FindThisText + 'and CSSClass ' + cssClass
    # No table matched: keep the historical result (None) but make it explicit.
    return None
def _fetch_soup(url, **soup_kwargs):
    """Download *url* and parse it with BeautifulSoup's ``html.parser``.

    The HTTP response is closed once parsing has finished (or failed). Extra
    keyword arguments such as ``parse_only`` are forwarded to BeautifulSoup.
    """
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response, "html.parser", **soup_kwargs)
    finally:
        response.close()


def main(CourseURL='http://courses.it-tallaght.ie/index.cfm/page/course/courseId/30', BaseURL='http://courses.it-tallaght.ie/'):
    """Parse a course page and the module pages it links to.

    The element with id ``programmeDescriptor`` is taken from the page at
    *CourseURL*; a fixed set of header tables and the course outcomes table
    are extracted from it. Every link with an ``href`` inside that element is
    then fetched (prefixed with *BaseURL*), parsed with a strainer restricted
    to the element with id ``moduleDescriptor``, and its prettified HTML is
    appended to the collected module text.

    Args:
        CourseURL: URL of the course page to parse.
        BaseURL: Prefix prepended to each module link's ``href``.

    Returns:
        A dict with the keys ``CourseAward``, ``CourseCode``,
        ``CourseDelivery``, ``SemesterCount``, ``CourseNFQLevel``,
        ``CourseDepartment``, ``CourseOutcomes`` and ``CourseModuleText``.

    Raises:
        Whatever ``urllib2.urlopen`` raises when a page cannot be fetched.
    """
    Spacer = "----------Parsing Course Page ------_"
    # The results are stored in a dictionary.
    ResultsDictionary = {}
    # Only the module descriptor element of each module page is parsed.
    ModuleDescriptorStrainer = SoupStrainer(id="moduleDescriptor")

    soup = _fetch_soup(CourseURL)
    # Just get the element holding the programme details.
    ProgrammeDetails = soup.find(id="programmeDescriptor")

    header_fields = (
        ('CourseAward', "Awards"),
        ('CourseCode', "Programme Code:"),
        ('CourseDelivery', "Mode of Delivery:"),
        ('SemesterCount', "No. of Semesters:"),
        ('CourseNFQLevel', "NFQ Level:"),
        ('CourseDepartment', "Department:"),
    )
    for key, header in header_fields:
        ResultsDictionary[key] = parse_table_by_header_name(ProgrammeDetails, header)
    # The outcomes table uses the non-unique css class "borders", so a known
    # piece of its text is used to pick the right table.
    ResultsDictionary['CourseOutcomes'] = parse_table_by_Contents_and_cssClass(
        ProgrammeDetails, "Knowledge - Breadth", "borders")

    # The parse helpers above already tolerate a missing programme descriptor
    # by returning error strings; do the same here instead of crashing.
    if ProgrammeDetails is None:
        module_anchors = []
    else:
        # href=True skips anchors without a link target, which would
        # otherwise make the URL concatenation below fail.
        module_anchors = ProgrammeDetails.find_all('a', href=True)

    # Collect the pieces and join once instead of repeatedly concatenating.
    module_html = ["<div> Course--Details</div>"]
    for courselink in module_anchors:
        ModuleLink = BaseURL + courselink['href']
        print(Spacer)
        # A single formatted string prints as text on Python 2 as well,
        # where print(a, b, c) would show a tuple.
        print('Now processing module %s in course code %s'
              % (ModuleLink, ResultsDictionary['CourseCode']))
        ModuleSoup = _fetch_soup(ModuleLink, parse_only=ModuleDescriptorStrainer)
        module_html.append(ModuleSoup.prettify(formatter="html"))
        print(Spacer)

    # All modules are loaded by now, so store the combined text.
    ResultsDictionary['CourseModuleText'] = "".join(module_html)
    print("Results stored in %s Dictionary Keys" % (list(ResultsDictionary),))
    return ResultsDictionary
# Run the parser with its default URLs when this file is executed as a script.
if __name__ == "__main__":
    main()