-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathReadText.py
More file actions
257 lines (205 loc) · 8.56 KB
/
ReadText.py
File metadata and controls
257 lines (205 loc) · 8.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
''' Sonia Moreno, 9/2017
Scrapes data from Carleton Enroll website containing course schedule information.
'''
from __future__ import division
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
import re
import csv
import sys
import json
''' Returns list of academic terms that user can choose from. Item in list
will be passed to function that returns html link with term info provided.
Example: 'term=18WI' in 'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=18WI&subject=CS'
'''
def Academic_Term():
    """Return the term codes offered by the Carleton Enroll landing page.

    Downloads the enroll homepage, locates the <select> element whose id is
    "termElement" and collects the ``value`` attribute of each <option>
    inside it (example: <option value="18WI">).

    Returns:
        list: the option values in page order, e.g. '18WI', usable as the
        ``term=`` query parameter of the enroll URL.
    """
    # Homepage showing listings of academic terms and course subjects
    enroll_page = requests.get('https://apps.carleton.edu/campus/registrar/schedule/enroll/').text
    parsed_page = BeautifulSoup(enroll_page, 'lxml')
    # Drop-down holding one <option> per academic term
    term_dropdown = parsed_page.find("select", id="termElement")
    # Keep only the value attribute of each option, not its display text
    return [choice['value'] for choice in term_dropdown.find_all("option")]
''' Returns list of course subjects. Each will be passed to function that returns
appropriate html link which contains specific course information for the subject
Example: 'subject=CS' in 'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=18WI&subject=CS'
'''
def Subject():
    """Return the subject abbreviations listed on the Carleton Enroll page.

    Downloads the enroll homepage, reads the <option> entries of the
    <select> element whose id is "subjectElement" (skipping the first,
    placeholder entry) and extracts the abbreviation found between
    parentheses in each label, e.g. 'Computer Science (CS)' -> 'CS'.

    Labels that contain no parenthesised abbreviation are skipped, since
    they cannot be turned into a ``subject=`` URL parameter.

    Returns:
        list: subject abbreviations in page order.
    """
    html_enroll = requests.get('https://apps.carleton.edu/campus/registrar/schedule/enroll/').text
    soup2 = BeautifulSoup(html_enroll, 'lxml')
    # Tag object containing list of subjects
    subject_summary = soup2.find("select", id="subjectElement")
    # Each subject within summary has tag "option"; the 1st item is the
    # 'Selected' placeholder, so it is excluded
    subjects = subject_summary.find_all("option")[1:]
    subj_abbrev = []
    for item in subjects:
        # Labels look like 'Computer Science (CS)'; only the abbreviation in
        # the parentheses is needed for the html link.
        found = re.search(r'\((.*?)\)', item.get_text())
        # re.search returns None when the label has no "(...)" part; skip it
        # instead of failing on None.group().
        if found is not None:
            subj_abbrev.append(found.group(1))
    return subj_abbrev
# Formats strings by joining their space-separated words with '+' (not '%20')
# Makes it so courses are searchable by names in the API
def format_course(text):
    """Join the space-separated words of *text* with '+' characters.

    The text is split on single spaces. Every non-empty word that is not
    the final element of the split is followed by '+'; empty elements
    (produced by leading, trailing or doubled spaces) contribute nothing.
    Consequently text that ends with a space yields a trailing '+', which
    Specific_Course_Info trims with ``[:-1]``.

    Args:
        text: string to format, e.g. 'CS 111'.

    Returns:
        The formatted string, e.g. 'CS+111'.
    """
    words = text.split(' ')
    last_position = len(words) - 1
    pieces = []
    # enumerate gives the real position of each word; looking the word up
    # with list.index() would return its FIRST occurrence and wrongly add a
    # '+' after a final word that repeats an earlier one.
    for position, word in enumerate(words):
        pieces.append(word)
        if word and position != last_position:
            pieces.append('+')
    return ''.join(pieces)
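''' Returns dict object whose 'course_info' key holds one record per course
(department, term, course id, title, enrollment, faculty, credits, start/end times, ...).
Scrapes every subject offered in the academic term provided (e.g. '18WI' for Winter 2018)
and also writes the result to data<term>.json
'''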
def _first_match(pattern, text, default="n/a"):
    """Return the first match of *pattern* in *text*, or *default* if none."""
    found = re.search(pattern, text)
    return found.group(0) if found is not None else default


def _clean_summary(node):
    """Turn the node that follows a course's faculty tag into summary text."""
    markup = node.encode("utf-8")
    if not isinstance(markup, str):
        # Python 3: encode() returned bytes. Convert back to text so the
        # str-based cleanup below and json.dump keep working. On Python 2
        # bytes is str, so the value is left exactly as before.
        markup = markup.decode("utf-8")
    # NOTE(review): str.strip() treats its argument as a SET of characters to
    # remove from both ends, not as a substring, so this chain can also eat
    # leading/trailing letters of the description. Kept byte-for-byte so the
    # generated data does not change; confirm the intent before replacing it.
    markup = markup.strip('<p>').strip('</').strip('class="prereq"><em>Prerequisite:</em> Instructor Permission').strip('<span>').strip('</span>')
    # Get rid of text within tags
    return re.sub("[<@*&?].*[>@*&?]", "", markup)


def _course_record(course, subject, term, ratings):
    """Build the record for one course <div>; return None for a related course."""
    number_tag = course.find(class_="coursenum")
    course_num = format_course(number_tag.get_text())
    # Ensures that no related courses are added
    if subject not in course_num:
        return None

    # The actual name of the course sits next to the coursenum tag but not
    # within its own tag; the last character of the formatted name is dropped.
    course_name = format_course(number_tag.next_sibling)[:-1]

    record = {}
    record['department'] = subject
    record['term'] = term
    record['course_id'] = course_num
    record['title'] = course_name

    status_tag = course.find(class_="status")
    if status_tag is not None:
        enrollment = status_tag.get_text()
        # Fall back to "n/a" when the status text lacks the expected
        # "Registered: N," / "Size: N," fields instead of raising IndexError.
        record['registered'] = _first_match(r'(?<=Registered: ).*?(?=\,)', enrollment)
        record['size'] = _first_match(r'(?<=Size: ).*?(?=\,)', enrollment)
    else:
        record['registered'] = "n/a"
        record['size'] = "n/a"

    faculty_tag = course.find(class_="faculty")
    if faculty_tag is not None:
        faculty = faculty_tag.get_text().strip()
        record['faculty'] = faculty
        description = faculty_tag.next_sibling
        if description is not None:
            record['summary'] = _clean_summary(description)
        else:
            record['summary'] = "n/a"
        # Add ratemyprofessor.com rating to dictionary. A rating applies when
        # the professor's last name appears in the faculty text (the extra
        # first-name clause of the earlier condition never changed the
        # outcome). No break: the last matching entry wins.
        for prof in ratings:
            if prof['teacherlastname_t'] in faculty:
                record['prof_rating'] = prof['averageratingscore_rf']
    else:
        record['faculty'] = "n/a"

    credits_tag = course.find(class_="credits")
    if credits_tag is not None:
        record['credits'] = credits_tag.get_text()
    else:
        record['credits'] = "n/a"

    overlays_tag = course.find(class_="codes overlays")
    if overlays_tag is not None:
        requirement_lines = overlays_tag.get_text().splitlines()
        # The first line of the block is skipped, as before
        record['requirements_met'] = tuple(requirement_lines[1:])

    # Start and end times for courses that have set times
    # Account for classes without set times
    if course.find(class_="start") is not None:
        record['start_time'] = course.find("span", {"class": "start"}).get_text()
        record['end_time'] = course.find(class_="end").get_text()
    else:
        record['start_time'] = "n/a"
        record['end_time'] = "n/a"
    return record


def Specific_Course_Info(term):
    """Scrape every subject's course listing for *term* and save it as JSON.

    Loads professor ratings from 'ratings.json', requests the enroll listing
    of each subject returned by Subject() for the given term, and builds one
    record per <div class="course"> whose course number contains the subject
    abbreviation (related courses from other subjects are skipped).

    Each record always has the keys 'department', 'term', 'course_id',
    'title', 'registered', 'size', 'faculty', 'credits', 'start_time',
    'end_time' and 'index' (1-based running counter). 'summary' is present
    only when the course has a faculty tag, 'prof_rating' only when a rating
    entry matches, and 'requirements_met' only when the course has a
    "codes overlays" element. Missing values are stored as "n/a".

    The result is also written to 'data<term>.json' in the working directory.

    Args:
        term: term code used in the enroll URL, e.g. '18WI'.

    Returns:
        defaultdict(list): the records under the single key 'course_info'.
    """
    with open('ratings.json') as ratings_file:
        ratings = json.load(ratings_file)

    course_info = defaultdict(list)
    index = 1
    for subject in Subject():
        # Course listings for subject during term provided
        listing_url = 'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=' + term + '&subject=' + subject
        soup = BeautifulSoup(requests.get(listing_url).text, 'lxml')
        for course in soup.find_all("div", class_="course"):
            record = _course_record(course, subject, term, ratings)
            if record is None:
                continue
            record['index'] = index
            index += 1
            course_info['course_info'].append(record)

    filename = 'data' + term + '.json'
    with open(filename, 'w') as fp:
        json.dump(course_info, fp)
    return course_info
# ''' Adds lists together from Specific_Course_Info so that each csv file will contain info
# for ALL subjects in one term
# '''
# def Append_Dicts(a_dict, b_dict):
# return
''' Iterates over a small slice of the available academic terms and invokes
Specific_Course_Info to scrape them. Does not return a value
(despite the name, no HTML string is built).
'''
def Generate_HTML():
    """Scrape course information for a sample of the available terms.

    Takes the second and third term codes returned by Academic_Term(),
    prints each one and runs Specific_Course_Info on it.
    Specific_Course_Info accepts only a term and walks every subject itself,
    so it is called once per term; passing a subject as a second argument
    raised TypeError.

    Returns:
        None. The scraped data is written to data<term>.json by
        Specific_Course_Info.
    """
    for term in Academic_Term()[1:3]:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print(term)
        Specific_Course_Info(term)
def main():
    """Scrape all subjects for the Winter 2018 term ('18WI')."""
    Specific_Course_Info('18WI')


# Run the scrape only when executed as a script, so that importing this
# module for its helper functions does not trigger network requests or
# write data18WI.json as a side effect.
if __name__ == "__main__":
    main()