In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import csv
import os
import re

In [None]:
def open_browser():
    '''Start a Chrome session and return its driver.

    Chrome is launched with remote debugging on port 9222.
    '''
    options = webdriver.ChromeOptions()
    launch_args = (
        '--remote-debugging-port=9222',
        # NOTE(review): this is passed to Chrome as a command-line argument;
        # it does not look like a Chrome switch - confirm it has the intended
        # effect of selecting the chromedriver binary.
        'webdriver.chrome.driver=chromedriver/chromedriver',
    )
    for arg in launch_args:
        options.add_argument(arg)
    return webdriver.Chrome(options=options)

def manual_login(base_url):
    '''Open the login page and wait for the user to log in by hand.

    base_url: root of the site; 'index.jsp' (login page) and 'menu.jsp'
        (page to scrape from) are resolved under it.

    Returns (driver, menu_url): the browser that now holds the logged-in
    session and the url of the menu page.

    Raises SystemExit if the user answers 'n' at the prompt; the browser is
    closed first so that no Chrome process is left behind.
    '''
    # URLs always use '/', so join by hand: os.path.join would insert
    # backslashes on Windows.
    root = base_url.rstrip('/')
    driver = open_browser()
    driver.get(f'{root}/index.jsp')
    ### user now logs in. After logging in, enter 'y' to continue.
    ok = input('Ready? ([y]/n): ')
    if ok.strip().lower() == 'n':
        driver.quit()
        raise SystemExit
    return driver, f'{root}/menu.jsp'

In [None]:
class Scraper:
    '''Navigates the course pages in an already logged-in browser and saves
    course listings and grading statistics as csv files.'''

    # relevant ids for the scraper: menu entry -> (xpath, window increment).
    # The increment is added to the current window index after the click; when
    # it is non-zero the scraper switches to the window at the new index.
    # note that these can change as and when updates are made to asc
    ids = {
        'Academic':             ('//*[@id="ygtvlabelel1"]',0),
        'All About Courses':    ('//*[@id="ygtvlabelel7"]',0),
        'Running Courses':      ('//a[@id="4_59_1"]',1),
        'Grading Statistics':   ('//a[@id="4_71_1"]',1),
    }

    # column order of the grade csv written by get_grades
    possible_grades = ['AA', 'AB', 'AP', 'AU', 'BB', 'BC', 'CC', 'CD', 'DD', 'FF', 'FR', 'PP', 'NP', 'DX', 'W', 'II']

    def __init__(self, driver: webdriver.Chrome, url: str) -> None:
        '''Keep {driver}, load {url} in it and start on window 0.'''
        self.url = url
        self.driver = driver
        self.driver.get(self.url)
        self.window_idx = 0

    def refresh(self):
        '''switch to the first window and reload it'''
        self.switch_window(0)
        self.driver.refresh()

    def switch_window(self, window_idx: int = 0):
        '''make the window at position {window_idx} of driver.window_handles the active one'''
        self.window_idx = window_idx
        self.driver.switch_to.window(self.driver.window_handles[self.window_idx])

    def click_path(self, path: str):
        '''Click through a '/'-separated sequence of menu entries.

        Every entry must be a key of `ids` (KeyError otherwise).
        Returns the index of the window that is active afterwards.
        '''
        steps = path.split('/')
        print(steps)
        for step in steps:
            xpath, window_inc = self.ids[step]
            self.driver.find_element(By.XPATH, value=xpath).click()
            if window_inc:
                self.switch_window(self.window_idx + window_inc)
        return self.window_idx

    def _submit_year_semester(self, year: str, semester: str):
        '''pick {year} and {semester} in the dropdowns and press submit'''
        self.choose_option('year', year)
        self.choose_option('semester', semester)
        self.driver.find_element(By.XPATH, value="//input[@name='submit']").click()

    def get_all_depts(self, year: str, semester: str):
        '''Return the department codes linked from the {year}-{semester} listing.

        A link whose `deptcd` holds several comma-separated codes yields one
        entry with the codes joined by '-'. Links whose href does not carry a
        `deptcd=<CODES>` value are skipped.
        '''
        self._submit_year_semester(year, semester)
        links = self.driver.find_elements(By.XPATH, value="//a[contains(@href, 'deptcd')]")
        depts = []
        for link in links:
            # get_attribute may return None; a non-matching href must not crash the scrape
            found = re.search(r'(?<=deptcd=)[A-Z,]+', link.get_attribute('href') or '')
            if found is None:
                continue
            depts.append(found.group().replace(',', '-'))
        return depts

    def get_courses(self, year: str, semester: str, dept: str, savefilename: str):
        '''gets all courses offered by {dept} in {year}-{semester}, saves to {savefilename}

        Returns the list of (code, name, instructor, slot, credits,
        text reference, description) tuples that was written.
        '''
        self._submit_year_semester(year, semester)
        # NOTE(review): this clicks the first link whose href contains {dept}
        # anywhere - confirm a short code cannot match an unrelated link.
        self.driver.find_element(By.XPATH, value=f'//a[contains(@href, "{dept}")]').click()
        rows_xpath = '//table[1]/tbody/tr'
        n_rows = len(self.driver.find_elements(By.XPATH, value=rows_xpath)[4:])
        courses = []
        for row_idx in range(n_rows):
            # look the rows up again on every pass: the loop body leaves this
            # page (course details) and comes back
            row = self.driver.find_elements(By.XPATH, value=rows_xpath)[4:][row_idx]
            cols = row.find_elements(By.XPATH, value='td')
            if len(cols) != 14:
                continue # ignore spurious rows (see html to figure out what this is for)
            code = cols[2].text
            name = cols[3].text
            # one instructor per '- ' separated chunk; keep only the first line of each
            instructor = '\n'.join(
                chunk.split('\n')[0] for chunk in cols[6].text.split('- ')[1:]
            )
            slot = cols[8].text.split('\n')[0]

            # get credits, text, desc from the course's own page
            cols[2].find_element(By.XPATH, value='a').click()
            summary = self.driver.find_elements(By.XPATH, value=rows_xpath)
            creds = summary[2].text.replace('Total Credits ', '')
            refs = summary[9].text.replace('Text Reference ', '')
            if refs == ' ':
                refs = 'None'
            summ = summary[10].text.replace('Description ', '')
            if summ == ' ':
                summ = 'None'
            self.driver.back()
            courses.append((code, name, instructor, slot, creds, refs, summ))
        self.driver.back()

        # save to csv; newline='' lets the csv module control line endings
        # (fields here contain embedded newlines)
        with open(savefilename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Code', 'Name', 'Instructor', 'Slot', 'Credits', 'Text Reference', 'Description'])
            writer.writerows(courses)
        return courses

    def choose_option(self, name: str, option: str):
        '''select an option from a dropdown menu

        Clicks the first option of the <select> named {name} whose text
        equals {option}; does nothing further if no option matches.
        '''
        select = self.driver.find_element(By.XPATH, f"//select[@name='{name}']")
        select.click()
        # './/option' searches inside this <select> only; a bare '//option'
        # would start from the document root and see every dropdown's options
        for opt in select.find_elements(By.XPATH, ".//option"):
            if opt.text == option:
                opt.click()
                break

    def put(self, name: str, text: str):
        '''fill in a text field named {name} with {text}'''
        elem = self.driver.find_element(By.XPATH, f"//input[@name='{name}']")
        # clear() replaces the old contents on every platform (a COMMAND+A
        # select-all only works on macOS)
        elem.clear()
        elem.send_keys(text)

    def collect_grade(self, code):
        '''Look up the grading statistics of course {code}.

        Returns a list of (full_code, {grade: count}) pairs, one per result
        table; full_code is {code} or '{code}-{section}'.
        Raises ValueError if a table lists a grade that is not in
        possible_grades or a line that is not '<grade> <count>'.
        '''
        self.put('txtcrsecode', code)
        self.driver.find_element(By.XPATH, "//input[@name='submit']").click()
        tables = self.driver.find_elements(By.XPATH, '//table')[3:] # ignore first 3 tables
        grades = []
        for table in tables:
            lines = table.text.split('\n')
            # third word from the end of the heading is the section name, or
            # 'for' when the heading names no section
            sect = lines[0].split(' ')[-3]
            full_code = code if sect == 'for' else f'{code}-{sect}'
            grade_dict = dict.fromkeys(self.possible_grades, 0)
            # first line is the heading and the last line is not a grade row
            for line in lines[1:-1]:
                grade, count = line.split(' ')
                if grade not in grade_dict:
                    # an unknown grade would misalign the csv columns
                    raise ValueError(f'{grade} not in {self.possible_grades}')
                grade_dict[grade] = int(count)
            grades.append((full_code, grade_dict))
        self.driver.back()
        return grades

    def get_grades(self, year: str, semester: str, courses: list[str], savefilename: str):
        '''Collect the grading statistics of every course in {courses} for
        {year}-{semester} and save them to {savefilename}.

        The csv has a 'Code' column followed by one column per entry of
        possible_grades. Returns the list of (code, {grade: count}) pairs.
        '''
        self.choose_option('year', year)
        self.choose_option('semester', semester)
        grades = []
        for course in courses:
            grades.extend(self.collect_grade(course))
        with open(savefilename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Code'] + self.possible_grades)
            for code, grade_dict in grades:
                writer.writerow([code] + [grade_dict[grade] for grade in self.possible_grades])
        return grades

In [None]:
# what to scrape; the strings are compared against the dropdown option text
year = '2022-2023'
semester = '1 - Autumn'

# one output directory for course listings, one for grading statistics
course_dir = f'courses-{year}-{semester}'
grade_dir = f'grades-{year}-{semester}'
os.makedirs(course_dir, exist_ok=True)
os.makedirs(grade_dir, exist_ok=True)


def coursefilename(dept):
    '''path of the course csv of department {dept}'''
    return f'{course_dir}/{dept}.csv'


def gradefilename(dept):
    '''path of the grade csv of department {dept}'''
    return f'{grade_dir}/{dept}.csv'

In [None]:
def courses_from_file(coursesfilename: str):
    '''Return the sorted, de-duplicated course codes (first column) of a
    course csv.

    The first row is treated as a header and skipped. Blank rows are ignored
    and an empty file yields an empty list.
    '''
    # newline='' is what the csv module expects, so that quoted fields with
    # embedded line breaks are read back intact
    with open(coursesfilename, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header; the default avoids StopIteration on an empty file
        return sorted({row[0] for row in reader if row})

def get_all_courses(scraper: Scraper, depts: list[str], year: str, semester: str):
    '''Scrape the {year}-{semester} course list of every department in {depts}.

    Each department is looked up on the site by the part of its name before
    the first '-', and saved to coursefilename(<full name>).
    '''
    for dept_name in depts:
        short_code, _, _ = dept_name.partition('-')
        target = coursefilename(dept_name)
        scraper.get_courses(year, semester, short_code, target)

def get_all_grades(scraper: Scraper, depts: list[str], year: str, semester: str):
    '''Scrape the grading statistics of every department in {depts}.

    The course codes are read from the department's course csv, printed, and
    the collected grades are saved to gradefilename(<department>).
    '''
    for dept_name in depts:
        course_codes = courses_from_file(coursefilename(dept_name))
        print(course_codes)
        target = gradefilename(dept_name)
        scraper.get_grades(year, semester, course_codes, target)

In [None]:
# root url of the site; manual_login opens index.jsp under it and waits at a
# prompt until the user has logged in by hand
base_url = 'https://asc.iitb.ac.in/acadmenu'
# login is the (driver, menu url) pair returned by manual_login
login = manual_login(base_url)

In [None]:
# unpack (driver, menu url) into the constructor, which loads the menu page
scraper = Scraper(*login)

In [None]:
# click down the menu to the running-courses page; every name between the
# slashes must be a key of Scraper.ids
scraper.click_path('Academic/All About Courses/Running Courses')

In [None]:
# department codes listed for the chosen year and semester
# NOTE(review): [:3] keeps only the first three departments - looks like a
# trial-run limit; confirm before doing a full scrape
depts = scraper.get_all_depts(year, semester)[:3]

In [None]:
# show the departments that will be scraped
depts

In [None]:
# in case of crashes, run this cell again (as often as needed)
# A department's csv is only written once all of its courses were scraped, so
# "file exists" means "department done". Selecting by file existence - rather
# than slicing depts by the number of files - stays correct however many times
# the cell is re-run, and leaves depts intact for the grading cells below.
pending_depts = [dept for dept in depts if not os.path.exists(coursefilename(dept))]
get_all_courses(scraper, pending_depts, year, semester)

In [None]:
# computing grading stats

In [None]:
# values handed to get_all_grades below: the first four characters of year
# ('2022') and the first character of semester ('1')
# NOTE(review): presumably this is how the grading-statistics dropdowns label
# their options - confirm against the page
year_for_grade = year[:4]
semester_for_grade = semester[0]

In [None]:
# make the first browser window (index 0) the active one again
scraper.switch_window(0)

In [None]:
# open the grading-statistics page ('Grading Statistics' is a key of Scraper.ids)
scraper.click_path('Grading Statistics')

In [None]:
# for every department in depts: read its course csv and save the grading
# statistics of those courses to the department's grade csv
get_all_grades(scraper, depts, year_for_grade, semester_for_grade)

In [None]:
# some patches, mostly data cleaning
# input directory: the scraped course csvs
courses_dir = f'courses-{year}-{semester}'
# output directory: the same csvs with duplicate course rows merged
cleaned_courses_dir = f'cleaned-courses-{year}-{semester}'

def remove_duplicate_courses(courses_dir, cleaned_courses_dir):
    '''Copy every csv in {courses_dir} to {cleaned_courses_dir} with one row
    per course code.

    Rows sharing a code (first column) are merged: the first row's fields are
    kept and the instructors (third column, separated by newlines or ', ') of
    all rows are combined without repeats, in first-seen order, joined by
    ', '. Rows are written sorted by code. Blank rows and files without a
    header row are skipped; {cleaned_courses_dir} is created if needed.
    '''
    os.makedirs(cleaned_courses_dir, exist_ok=True)
    for filename in sorted(os.listdir(courses_dir)):
        if not filename.endswith('.csv'):
            continue
        with open(os.path.join(courses_dir, filename), 'r', newline='') as f:
            reader = csv.reader(f)
            hdr = next(reader, None)
            if hdr is None:
                continue  # empty file: nothing to clean
            courses = {}
            for row in reader:
                if not row:
                    continue
                instrs = [
                    instr
                    for instr in row[2].replace('\n', ', ').split(', ')
                    if instr != ''
                ]
                if row[0] not in courses:
                    # dict keys act as an insertion-ordered set, which keeps
                    # the output identical from run to run
                    courses[row[0]] = row[:2] + [{}] + row[3:]
                courses[row[0]][2].update(dict.fromkeys(instrs))
        with open(os.path.join(cleaned_courses_dir, filename), 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(hdr)
            for code in sorted(courses):
                merged = courses[code]
                writer.writerow(merged[:2] + [', '.join(merged[2])] + merged[3:])

In [None]:
# the output directory has to exist before the cleaned csvs are written into it
os.makedirs(cleaned_courses_dir, exist_ok=True)
remove_duplicate_courses(courses_dir, cleaned_courses_dir)

In [None]:
# sanity check: number of entries in the cleaned-course and grade directories
# next to the number of departments
len(os.listdir('cleaned-' + course_dir)), len(os.listdir(grade_dir)), len(depts)

In [None]:
import statistics

# Combine the cleaned course csvs and the grade csvs into one file,
# allcourses-{year}-{semester}.csv, adding mean / median / mode / std. dev. of
# the grade points to every course section.
coursedir = f'cleaned-courses-{year}-{semester}'
gradedir = f'grades-{year}-{semester}'
possible_grades = ['AA', 'AB', 'AP', 'AU', 'BB', 'BC', 'CC', 'CD', 'DD', 'FF', 'FR', 'PP', 'NP', 'DX', 'W', 'II']
# grade points of each entry of possible_grades, position by position
grade_to_num = [10, 9, 10, 6, 8, 7, 6, 5, 4, 0, 0, 10, 0, 0, 0, 0]

# departments = names of the csv files in coursedir, without the extension
depts = [filename[:-4] for filename in sorted(os.listdir(coursedir)) if filename.endswith('.csv')]

print(depts)

grades = {}
for dept in depts:
    grades[dept] = {} # {code: [grades]}
    grade_path = f'{gradedir}/{dept}.csv'
    if not os.path.exists(grade_path):
        # no grades scraped for this department: its courses get 'NA' below
        continue
    with open(grade_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header
        for row in reader:
            if not row:
                continue
            grades[dept][row[0]] = list(map(int, row[1:]))

# the with-block guarantees the combined file is flushed and closed
with open(f'allcourses-{year}-{semester}.csv', 'w', newline='') as concat:
    writer = csv.writer(concat)
    writer.writerow(['Code', 'Name', 'Instructor', 'Slot', 'Credits', 'Text Reference', 'Description', 'Mean', 'Median', 'Mode', 'Std. Dev.'] + possible_grades)

    for dept in depts:
        with open(f'{coursedir}/{dept}.csv', 'r', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # header
            for row in reader:
                if not row:
                    continue
                code = row[0]
                found = False
                # a course matches every grade entry 'CODE' or 'CODE-section'
                for code_sec, grds in grades[dept].items():
                    if code != code_sec.split('-')[0]:
                        continue
                    found = True
                    print(grds, dept, code)
                    # one grade-point value per student
                    grades_list = [
                        float(grade_points)
                        for grade_points, num_given in zip(grade_to_num, grds)
                        for _ in range(num_given)
                    ]
                    if not grades_list:
                        # nobody graded in this section: no row is written
                        continue # pathos, man, pathos
                    if len(grades_list) > 1:
                        mean = statistics.mean(grades_list)
                        median = statistics.median(grades_list)
                        mode = statistics.mode(grades_list)
                        stddev = statistics.stdev(grades_list)
                    else:
                        # stdev needs at least two values
                        mean = median = mode = grades_list[0]
                        stddev = 0.
                    writer.writerow([code_sec] + row[1:] + [mean, median, mode, stddev] + grds)
                if not found:
                    grds = ['NA']*len(possible_grades)
                    mean = median = mode = stddev = 'NA'
                    writer.writerow(row + [mean, median, mode, stddev] + grds)