# ReadText.py -- part of CourseSchedulesApp (sonimore/CourseSchedulesApp on GitHub).
''' Sonia Moreno, 9/2017
 Scrapes data from Carleton Enroll website containing course schedule information.
 '''
from __future__ import division
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
import re
import csv
import sys
import json


def Academic_Term():
    """Return the academic-term codes listed on the Enroll homepage.

    Downloads the Enroll homepage, locates the ``<select id="termElement">``
    drop-down and collects the ``value`` attribute of each of its
    ``<option>`` entries (for example ``<option value="18WI">``).

    A returned code is what goes after ``term=`` in a listing URL such as
    'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=18WI&subject=CS'.

    Returns:
        list: one term code per ``<option>``, in page order.
    """
    # Homepage showing the drop-downs of academic terms and course subjects.
    enroll_page = requests.get('https://apps.carleton.edu/campus/registrar/schedule/enroll/').text
    term_select = BeautifulSoup(enroll_page, 'lxml').find("select", id="termElement")

    # Human-readable names such as "Winter 2018" are the option text; the
    # URL needs the value attribute instead.
    return [choice['value'] for choice in term_select.find_all("option")]


def Subject():
    """Return the subject abbreviations listed on the Enroll homepage.

    Downloads the Enroll homepage, locates the ``<select id="subjectElement">``
    drop-down and, for every ``<option>`` except the first one, extracts the
    text between the first pair of parentheses, e.g. 'Computer Science (CS)'
    yields 'CS'.

    A returned abbreviation is what goes after ``subject=`` in a listing URL
    such as
    'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=18WI&subject=CS'.

    Returns:
        list: abbreviations in page order. Options whose text contains no
        parenthesised abbreviation are skipped, since they cannot be turned
        into a ``subject=`` value.
    """
    enroll_page = requests.get('https://apps.carleton.edu/campus/registrar/schedule/enroll/').text
    subject_select = BeautifulSoup(enroll_page, 'lxml').find("select", id="subjectElement")

    abbreviations = []
    # The first option is the 'Selected' placeholder rather than a subject.
    for option in subject_select.find_all("option")[1:]:
        # Raw string: '\(' is not a valid escape in an ordinary string literal.
        found = re.search(r'\((.*?)\)', option.get_text())
        if found is not None:
            abbreviations.append(found.group(1))
    return abbreviations

def format_course(text):
    """Join the space-separated words of *text* with '+'.

    Makes course numbers and names usable as search terms in the API, e.g.
    'CS 111' becomes 'CS+111'.

    The text is split on single spaces. Empty pieces (produced by leading,
    trailing or repeated spaces) never contribute a '+' of their own, so a
    run of spaces collapses to a single '+'. A non-empty word is followed
    by '+' unless it is the very last piece; consequently text that ends
    with a space keeps one trailing '+' ('Intro ' -> 'Intro+').

    Args:
        text: the string to format.

    Returns:
        The formatted string.
    """
    words = text.split(' ')
    last_position = len(words) - 1
    pieces = []
    # Compare positions, not values: looking the word up with list.index()
    # finds its first occurrence, which mis-detects the end of the list
    # whenever the final word also appears earlier.
    for position, word in enumerate(words):
        pieces.append(word)
        if word != '' and position != last_position:
            pieces.append('+')
    return ''.join(pieces)

def _enrollment_counts(status_text):
    """Return ``(registered, size)`` parsed from a course status line.

    Each figure is the text between its label ('Registered: ' / 'Size: ')
    and the next comma; "n/a" is used for a figure that is not present.
    """
    registered = re.findall(r'(?<=Registered: ).*?(?=\,)', status_text)
    size = re.findall(r'(?<=Size: ).*?(?=\,)', status_text)
    return (registered[0] if registered else "n/a",
            size[0] if size else "n/a")


def _course_record(course, subject, term, ratings):
    """Build the info dict for one ``<div class="course">`` element.

    Returns None when the element has no course number or when its course
    number does not contain *subject* (i.e. it is a related course that is
    only cross-listed on this subject's page).
    """
    num_tag = course.find(class_="coursenum")
    if num_tag is None:
        return None
    course_num = format_course(num_tag.get_text())
    # Ensures that no related courses are added.
    if subject not in course_num:
        return None

    # The course name is the bare text right after the coursenum tag; it is
    # not wrapped in a tag of its own. [:-1] drops the last character of
    # the formatted name -- presumably the trailing '+' that format_course
    # leaves when the raw text ends with a space.
    course_name = format_course(num_tag.next_sibling)[:-1]

    info = {
        'department': subject,
        'term': term,
        'course_id': course_num,
        'title': course_name,
    }

    status_tag = course.find(class_="status")
    if status_tag is not None:
        info['registered'], info['size'] = _enrollment_counts(status_tag.get_text())
    else:
        info['registered'] = "n/a"
        info['size'] = "n/a"

    faculty_tag = course.find(class_="faculty")
    if faculty_tag is not None:
        faculty = faculty_tag.get_text().strip()
        info['faculty'] = faculty

        summary = faculty_tag.next_sibling
        if summary is not None:
            # NOTE(review): str.strip() removes any of the listed characters
            # from both ends, not a literal prefix/suffix, so this chain can
            # also eat leading or trailing letters of the description. It
            # also depends on Python 2 byte strings (bytes.strip(str) fails
            # on Python 3). Left unchanged because the intended output
            # cannot be confirmed from this file alone.
            summary = summary.encode("utf-8").strip('<p>').strip('</').strip('class="prereq"><em>Prerequisite:</em> Instructor Permission').strip('<span>').strip('</span>')
            # Drop everything from the first markup character to the last one.
            summary = re.sub("[<@*&?].*[>@*&?]", "", summary)
            info['summary'] = summary
        else:
            info['summary'] = "n/a"

        # Add the ratemyprofessor.com rating. A record matches when its last
        # name occurs in the faculty text (the former additional
        # first-and-last-name test was subsumed by this one). No early exit:
        # when several records match, the last one wins.
        for prof in ratings:
            if prof['teacherlastname_t'] in faculty:
                info['prof_rating'] = prof['averageratingscore_rf']
    else:
        info['faculty'] = "n/a"
        # Keep the record shape consistent with the faculty-present case.
        info['summary'] = "n/a"

    credits_tag = course.find(class_="credits")
    info['credits'] = credits_tag.get_text() if credits_tag is not None else "n/a"

    overlays_tag = course.find(class_="codes overlays")
    if overlays_tag is not None:
        # The first line of the overlay text is skipped.
        info['requirements_met'] = tuple(overlays_tag.get_text().splitlines()[1:])

    # Start and end times; some courses have no set meeting time.
    if course.find(class_="start") is not None:
        info['start_time'] = course.find("span", {"class": "start"}).get_text()
        info['end_time'] = course.find(class_="end").get_text()
    else:
        info['start_time'] = "n/a"
        info['end_time'] = "n/a"

    return info


def Specific_Course_Info(term):
    """Scrape the course listings of every subject for *term* and save them.

    For each abbreviation returned by Subject(), downloads the Enroll
    listing page for that term and subject and builds one dict per
    ``<div class="course">`` element that belongs to the subject. The
    collected data is written as JSON to 'data<term>.json' in the current
    directory and is also returned.

    Reads 'ratings.json' from the current directory; it is iterated as a
    sequence of records with 'teacherlastname_t' and
    'averageratingscore_rf' entries.

    Args:
        term: term code as used in the listing URL, e.g. '18WI'.

    Returns:
        defaultdict(list): its 'course_info' entry is the list of course
        dicts (the entry is absent when no course was found). Every dict
        has the keys 'department', 'term', 'course_id', 'title',
        'registered', 'size', 'faculty', 'summary', 'credits',
        'start_time', 'end_time' and 'index' (a running 1-based counter),
        with "n/a" standing in for missing information; 'prof_rating' and
        'requirements_met' are present only when available.
    """
    with open('ratings.json') as ratings_file:
        ratings = json.load(ratings_file)

    course_info = defaultdict(list)
    index = 1
    for subject in Subject():
        url = 'https://apps.carleton.edu/campus/registrar/schedule/enroll/?term=' + term + '&subject=' + subject

        # Course listings for this subject during the given term.
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        for course in soup.find_all("div", class_="course"):
            record = _course_record(course, subject, term, ratings)
            if record is None:
                continue
            record['index'] = index
            index += 1
            course_info['course_info'].append(record)

    filename = 'data' + term + '.json'
    with open(filename, 'w') as fp:
        json.dump(course_info, fp)
    return course_info


# ''' Adds lists together from Specific_Course_Info so that each csv file will contain info
# for ALL subjects in one term
# '''
# def Append_Dicts(a_dict, b_dict):
# 	return


def Generate_HTML():
    """Scrape and save the course data for a small sample of terms.

    Takes the second and third entries of Academic_Term() and runs
    Specific_Course_Info() on each one; that call iterates over every
    subject itself and writes one 'data<term>.json' file per term, so no
    per-subject loop is needed here. Each term code is printed before it is
    processed.

    Returns:
        None.
    """
    for term in Academic_Term()[1:3]:
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(term)
        # Specific_Course_Info takes the term only.
        Specific_Course_Info(term)

def main():
    """Scrape and save the course listings for term '18WI' (Winter 2018)."""
    Specific_Course_Info('18WI')


# Run the scrape only when this file is executed as a script, so that
# importing the module for its functions does not trigger network requests
# and file writes.
if __name__ == '__main__':
    main()