# -*- coding: utf-8 -*-
# A dirty script to pull files from It's Learning
# - Bart van Blokland
# - Updated by Karthik Hari (2023)
# USAGE:
# 0. Install Python 3.x (including pip) if you haven't already
# 1. Install required packages (`pip install -r requirements.txt`)
# 2. Run the script (`python scrape.py`)
# WHAT IS DOWNLOADED BY THIS SCRIPT?
# 1. All messages, both through the new and old system, including attachments
# 2. Bulletin messages and text posts, including comments
# 3. Assignments.
# 3.1: If you're a student, your submission
# 3.2: If you teach the course, all student submissions, grades, and feedback
# 4. Forum posts. Includes any attached images.
# 5. Notes and links (both old and new style)
# 6. Surveys and tests (if you have sufficient access, the reports generated by It's Learning are grabbed as well)
# 7. Files
# The folder structure of the course is preserved.
# There may still be things the script doesn't grab, but at least the most important parts are there.
# You may need to set It's Learning's language to English (might be an Innsida setting, not sure).
# Some bits and pieces rely on the English language pack (sorry, there was no easier way)
# --- IMPORTS ---
# Imports for fixing freezing issues
from multiprocessing import Queue
import requests
from requests.exceptions import InvalidURL
from lxml.html.soupparser import fromstring
from lxml.html import tostring, HtmlElement
from lxml import etree
# Python std lib imports
import os.path
import os
import re
import html
import sys
import platform
import json
import traceback
import base64
from shutil import rmtree
from time import sleep
import getpass
from urllib.parse import urlparse
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
# Requires Python 3.4
from pathlib import Path
from requests.sessions import session
from src.fcss_sso import adfsLogin
import src.itslearning_urls as its
from src.parser import makeParser
# --- SETTINGS ---
parser = makeParser()
disable_warnings(category=InsecureRequestWarning)
args = parser.parse_args()
# I've sprinkled delays around the code to ensure the script isn't spamming requests at maximum rate.
# Each time such a delay occurs, it waits for this many seconds.
# Feel free to increase this if you plan to run this script overnight.
rate_limiting_delay_seconds = args.rate_limit
# Some bits and pieces of text may contain special characters. Since a number of these are used
# in file paths and file names, they have to be filtered out.
# Filename characters are deleted from file names
# File path characters are only deleted from entire/complete file paths
# I may have missed one or two.
invalid_path_characters = [':', ',', '*', '?', '"', '<', '>', '\t', '`', '´', '|']
invalid_filename_characters = [':', ',', '*', '/', '\\', '?', '"', '<', '>', '\t', '`', '´', '|']
# All output files will receive this extension. Since a lot of stuff contains raw HTML, I used HTML
# as the file type. You may want to change this to .txt though, since many files also contain plaintext bits.
output_text_extension = args.output_extension
output_folder_name = args.output_dir
# Use if the program crashed and stopped early. Skips to a course with a specific index
# If this value is non-zero, downloading of the messaging inbox is also skipped.
# The index is 1-based and corresponds to the course index printed to the console
# when the dump of a new course is started.
skip_to_course_with_index = max(args.skip_to_course, (1 if args.courses_only or args.projects_only else 0))
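# Worked example (assuming the parser defaults --skip-to-course to 0): running with
# --courses-only and no explicit skip index evaluates max(0, 1) = 1, so the script
# starts at the first course and skips the messaging inbox.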
# --- INTRO ---
if not args.do_listing:
print('----- It\'s Learning dump script -----')
print('Created by: Bart van Blokland (bart.van.blokland@ntnu.no)')
print('Updated in 2023 by: Karthik Hari (hello@khari.me)')
print()
print('Greetings! This script will help you download your content off of It\'s Learning.')
print('We\'ll start by selecting a directory where all the files are going to be saved.')
if os.name == 'nt':
print()
print('NOTE: Since you\'re a Windows user, please keep in mind that file paths can only be 255 characters long. This is a Windows limitation I can\'t do anything about.')
print('For files that can not be created due to this limitation, the script falls back to saving them in a single overflow directory.')
print('For the best results, I recommend creating a folder in the root of your hard drive. For example; C:\\dump or D:\\dump.')
print('You can do this by clicking on My Computer while selecting a directory, double clicking on a hard drive, creating a directory named \'dump\', and selecting it.')
print('This minimises the number of files that end up in the overflow directory.')
print()
# Determines where the program dumps its output.
# Note that the trailing slash is mandatory.
output_folder_name = args.output_dir
is_directory_empty = False
while not is_directory_empty:
if output_folder_name is None:
input('Press Enter to continue and select a directory.')
# User interface goodies
try:
# A bit ugly, but modules that were already imported won't be imported again, so this is fine.
import tkinter
from tkinter.filedialog import askdirectory
except ImportError as ie:
print('')
print('!!! Could not import tkinter.')
print("If you don't have tkinter installed, specify the output dir by using the output parameter '--output-dir'")
print('')
raise ie
tkinter.Tk().withdraw()
output_folder_name = askdirectory()
if output_folder_name == '':
print('Folder selection cancelled. Aborting.')
sys.exit(0)
output_folder_name = os.path.abspath(output_folder_name)
is_directory_empty = not os.listdir(output_folder_name)
if args.recreate_out_dir and os.path.exists(output_folder_name):
print('Recreating output directory..')
rmtree(output_folder_name)
os.makedirs(output_folder_name)
is_directory_empty = not os.listdir(output_folder_name)
if not is_directory_empty:
print()
print('The selected directory is not empty; the script needs an empty directory to work properly.')
print('Press enter to try again, and select a new one.')
print('You can always create a new directory and select it; that one will always be empty.')
print()
input('Press Enter to continue and try selecting a directory again.')
print('Selected output folder:', output_folder_name)
elif args.recreate_out_dir and os.path.exists(output_folder_name):
print('Recreating output directory..')
rmtree(output_folder_name)
os.makedirs(output_folder_name)
# If a crash occurs, the script can skip all elements in folders up to the point where it left off.
# The state is stored in a small text file created inside the working directory.
# Turn this setting on to allow the creation of these checkpoints. They are only really useful if you can fix the issue causing the crash in the first place.
enable_checkpoints = args.enable_checkpoints
# --- CONSTANTS ---
# pretend to be a Mac running Firefox
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0'}
institutions = ['forsyth']
progress_file_location = os.path.join(os.getcwd(), 'saved_progress_state.txt')
overflow_count = 0
# If an override institution was specified, redefine the institutions list so it is the only one in there.
if args.institution is not None:
institutions = [args.institution.lower()]
# --- HELPER FUNCTIONS ---
def delay():
sleep(rate_limiting_delay_seconds)
def convert_html_content(html_string):
unescaped = html.unescape(html_string).split('\n')
return '\n'.join([string.strip() for string in unescaped])
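# Illustrative example of convert_html_content:
#   convert_html_content('&lt;b&gt;Hi&lt;/b&gt;\n  some text ')
#   -> '<b>Hi</b>\nsome text'
# i.e. HTML entities are unescaped and each line is stripped of surrounding whitespace.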
def sanitisePath(filePath):
# I don't care about efficiency.
for character in invalid_path_characters:
# Fix for absolute paths on windows
if os.name == 'nt' and os.path.isabs(filePath) and character == ':':
filePath = filePath[0:2] + filePath[2:].replace(character, '')
else:
filePath = filePath.replace(character, '')
filePath = '/'.join([m.strip() for m in filePath.split('/')])
return filePath
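# Illustrative example of sanitisePath:
#   sanitisePath('Course: Algebra/Notes?') -> 'Course Algebra/Notes'
# Invalid characters are deleted and each path segment is stripped; on Windows the
# drive colon (e.g. 'C:') is preserved.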
def createUniqueFilename(path):
path_exported = Path(path)
folder_parts = path_exported.parts[0:-1]
file_name = '.'.join(path_exported.name.split('.')[0:-1])
extension = path_exported.name.split('.')[-1]
count = 1
while os.path.exists(path):
path = '/'.join(folder_parts) + '/' + file_name + ' (Duplicate ' + str(count) + ').' + extension
count += 1
return path
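# Illustrative example of createUniqueFilename:
#   if 'dump/report.html' already exists, this returns 'dump/report (Duplicate 1).html',
#   then 'dump/report (Duplicate 2).html' on the next collision, and so on.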
def sanitiseFilename(filename):
for character in invalid_filename_characters:
filename = filename.replace(character, '')
return filename
def makeDirectories(path):
cleaned_path = sanitisePath(path)
abs_path = os.path.abspath(cleaned_path)
if not os.path.exists(abs_path):
try:
os.makedirs(abs_path)
except FileNotFoundError:
print('COULD NOT CREATE A FOLDER AT: ')
print(abs_path.encode('ascii', 'ignore'))
print('If you\'re on Windows this can happen due to Windows being unable to handle paths longer than 255 characters.')
print('Any files dumped in these directories will be redirected to the overflow directory.')
return abs_path
# Windows has this amazing feature called "255 character file path limit"
# Here's a function made specifically for countering this issue.
def dumpToOverflow(content, filename):
global overflow_count
overflow_count += 1
filepath, basename = os.path.split(os.path.normpath(sanitisePath(filename)))
dirtree = filepath.split(os.sep)
overflowDirectory = output_folder_name + '/' + dirtree[2] + '/Overflowed Files'
if not os.path.exists(overflowDirectory):
overflowDirectory = makeDirectories(overflowDirectory)
total_path = sanitisePath(overflowDirectory + '/' + str(overflow_count) + '_' + basename)
with open(total_path, 'wb') as file:
file.write(content)
# Create a txt file with original file location
overflow_info_txt_filename = os.path.splitext(os.path.normpath(total_path))[0] + '.txt'
original_location = ('Original file path: ' + filename).encode('utf-8')
bytesToTextFile(original_location, overflow_info_txt_filename)
print('FILE WAS WRITTEN TO OVERFLOW DIRECTORY - path too long (Windows issue)')
print('Original file path:', filename.encode('ascii', 'ignore'))
print('New file path:', total_path.encode('ascii', 'ignore'))
def bytesToTextFile(content, filename):
filename = sanitisePath(filename)
filename = os.path.abspath(createUniqueFilename(filename))
if len(filename) >= 254 and 'Windows' in platform.system():
dumpToOverflow(content, filename)
else:
with open(filename, 'wb') as file:
file.write(content)
# Conversion between formats of one library to another
def convert_lxml_form_to_requests(lxml_form_values):
form_dict = {}
for item in lxml_form_values.form_values():
form_dict[item[0]] = item[1]
return form_dict
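# Illustrative example: an lxml form with values
#   [('__VIEWSTATE', 'abc'), ('__EVENTTARGET', '')]
# becomes the dict {'__VIEWSTATE': 'abc', '__EVENTTARGET': ''}, which is the shape
# requests expects for its data= parameter.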
def do_feide_relay(session, relay_response):
relay_page = fromstring(relay_response.text)
relay_form = relay_page.forms[0]
relay_form_dict = convert_lxml_form_to_requests(relay_form)
return session.post(relay_form.action, data = relay_form_dict)
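# Note (my reading of the code): SSO relay pages are typically self-submitting HTML
# forms that a browser would POST automatically via JavaScript; do_feide_relay
# replicates that step by POSTing the first form's fields to the form's action URL.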
def download_file(institution, url, destination_directory, session, index=None, filename=None, disableFilenameReencode=False):
try:
file_download_response = session.get(url, allow_redirects=True)
except Exception:
# Can occur when the URL is a Base64-encoded image. If so, decode and dump it.
if its.base64_png_image_url.format(its.root_url[institution]) in url or its.base64_jpeg_image_url.format(its.root_url[institution]) in url:
try:
extension = url.split(':')[2].split(';')[0].split('/')[1]
print('\tDownloaded Base64 encoded {} image'.format(extension).encode('ascii', 'ignore'))
start_index = url.index(',') + 1
base64_encoded_file_contents = url[start_index:]
decoded_bytes = base64.b64decode(base64_encoded_file_contents)
bytesToTextFile(decoded_bytes, destination_directory + '/' + base64_encoded_file_contents[0:10] + '.' + extension)
except Exception:
print('Base64 Image Download Failed: unknown image format. Skipping.')
return
elif url.startswith('/'):
try:
file_download_response = session.get(its.root_url[institution] + url, allow_redirects=True)
except Exception:
print('FAILED TO DOWNLOAD FILE (INVALID URL):', url.encode('ascii', 'ignore'))
return
else:
print('FAILED TO DOWNLOAD FILE (INVALID URL):', url.encode('ascii', 'ignore'))
return
# If links do not point at It's Learning, the header format might be different
if filename is None:
try:
filename_header = file_download_response.headers['Content-Disposition']
filename_start = filename_header.split('filename="')[1]
filename_end = filename_start.find('"')
filename = filename_start[0:filename_end]
except (KeyError, IndexError):
# Hope that the filename was part of the URL
filename = os.path.basename(urlparse(url).path)
if index is not None:
filename = str(index) + '_' + filename
# Fix the broken filename decoding done by requests
if not disableFilenameReencode:
initial_filename = filename
try:
filename = filename.encode('latin1').decode('utf-8')
except UnicodeDecodeError:
filename = initial_filename
except UnicodeEncodeError:
filename = initial_filename
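# Illustrative example of the re-encode fix above: a UTF-8 filename like 'José.pdf'
# that was mis-decoded as Latin-1 arrives as 'JosÃ©.pdf'; re-encoding it as Latin-1
# and decoding as UTF-8 recovers 'José.pdf'.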
# Special case where the server puts slashes in the file name
# sanitiseFilename() cuts away too many characters here.
filename = filename.replace('/', '')
# check if it downloads a PDF without assigning a file extension -k
if file_download_response.content.startswith(b'%PDF') and not filename.lower().endswith('.pdf'):
filename = filename + '.pdf'
print('\tDownloaded', filename)
if not os.path.exists(destination_directory):
destination_directory = makeDirectories(destination_directory)
filename = sanitisePath(filename)
total_file_name = os.path.abspath(sanitisePath(destination_directory) + "/" + filename)
total_file_name = createUniqueFilename(total_file_name)
if len(total_file_name) >= 255 and 'Windows' in platform.system():
dumpToOverflow(file_download_response.content, total_file_name)
else:
with open(total_file_name, 'wb') as outputFile:
outputFile.write(bytearray(file_download_response.content))
# Add sleep for rate limiting
delay()
return filename
def doPostBack(page_url, postback_action, page_document, postback_parameter=None):
postback_form = None
for form in page_document.forms:
if '__EVENTTARGET' in form.fields:
postback_form = form
break
if postback_form is None:
raise Exception('No postback form found on page!\nURL: ' + page_url)
postback_form.fields['__EVENTTARGET'] = postback_action
if postback_parameter is not None:
postback_form.fields['__EVENTARGUMENT'] = postback_parameter
post_data = convert_lxml_form_to_requests(postback_form)
# Submitting the form to obtain the next page
headers = {}
headers['Referer'] = page_url
messaging_response = session.post(page_url, headers=headers, data=post_data, allow_redirects=True)
return messaging_response
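# Note on doPostBack: It's Learning is an ASP.NET WebForms application, so paging and
# filtering are driven by hidden form fields. Setting __EVENTTARGET (and optionally
# __EVENTARGUMENT) and re-POSTing the page form emulates the client-side
# __doPostBack(target, argument) call a browser would make.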
def loadPaginationPage(page_url, current_page_document, backpatch_character_index = 6):
next_page_button = current_page_document.find_class('previous-next')
found_next_button = False
for element in next_page_button:
if len(element) > 0 and (element[0].get('title') == 'Next' or element[0].get('title') == 'Neste'):
next_page_button = element
found_next_button = True
break
if not found_next_button:
print('\tItem complete: this was the last page')
return False, None
print('\tLoading next page')
# Locating the event name for obtaining the next page
post_back_event = next_page_button[0].get('id')
# Backpatching event title
post_back_event = list(post_back_event)
post_back_event[backpatch_character_index] = '$'
post_back_event = ''.join(post_back_event)
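# Why the backpatch (my reading of the code): ASP.NET renders client element IDs with
# '_' separators (e.g. 'ctl00_btnNext') while the server-side event target expects the
# UniqueID form with '$' (e.g. 'ctl00$btnNext'); swapping the character at
# backpatch_character_index converts one into the other.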
messaging_response = doPostBack(page_url, post_back_event, current_page_document)
return True, messaging_response
# --- DUMPING OF VARIOUS BITS OF ITS LEARNING FUNCTIONALITY ---
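# processTest: dumps a test/survey element. If the account can access the results page,
# the It's Learning generated XLS and HTML reports are downloaded; in addition, every
# visible response is dumped page by page into an 'Explicit dump' folder.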
def processTest(institution, pathThusFar, testURL, session):
test_response = session.get(testURL, allow_redirects = True)
test_document = fromstring(test_response.text)
test_title = test_document.find_class('ccl-pageheader')[0][0].text_content()
print('\tDownloading test/survey:', test_title.encode('ascii', 'ignore'))
dumpDirectory = pathThusFar + '/Test - ' + test_title
dumpDirectory = sanitisePath(dumpDirectory)
dumpDirectory = makeDirectories(dumpDirectory)
manualDumpDirectory = dumpDirectory + '/Explicit dump'
manualDumpDirectory = makeDirectories(manualDumpDirectory)
try:
# If we have access to download all results, we do so here,
# grabbing both the XLS and HTML reports.
show_result_url = its.root_url[institution] + test_document.get_element_by_id('result')[0].get('href')[2:]
download_file(institution, show_result_url + '&Type=2', dumpDirectory, session, disableFilenameReencode=False)
download_file(institution, show_result_url + '&Type=2&HtmlType=true', dumpDirectory, session, disableFilenameReencode=False)
print('\tIt\'s Learning generated report downloaded.')
except KeyError:
# It's fine if we can't see the 'Show result' button; the manual dump will catch whatever is visible to us
pass
pages_remaining = True
while pages_remaining:
row_index = 0
entries_remaining = True
while entries_remaining:
try:
table_entry_element = test_document.get_element_by_id('row_{}'.format(row_index))
except KeyError:
# End the loop when there are no more submissions
print('\tAll entries found on page.')
entries_remaining = False
continue
index_offset = 0
if len(table_entry_element[0]) > 0 and table_entry_element[0][0].get('id') is not None and 'check' in table_entry_element[0][0].get('id'):
index_offset = 1
entry_name = table_entry_element[0 + index_offset].text_content()
entry_date = table_entry_element[1 + index_offset].text_content()
try:
entry_url = its.root_url[institution] + table_entry_element[2 + index_offset][0].get('href')[2:]
except IndexError:
# Happens if you don't have access rights to view the responses.
entry_url = None
if entry_url is not None:
print('\tDownloading response from', entry_name.encode('ascii', 'ignore'))
entry_response = session.get(entry_url, allow_redirects=True)
entry_document = fromstring(entry_response.text)
file_content = convert_html_content(etree.tostring(entry_document.find_class('itsl-formbox')[0]).decode('utf-8')).encode('utf-8')
file_name = manualDumpDirectory + '/' + sanitiseFilename(entry_name) + ' ' + sanitiseFilename(entry_date) + output_text_extension
bytesToTextFile(file_content, file_name)
else:
print('\tSkipping response from', entry_name.encode('ascii', 'ignore'), ': No response present or insufficient privileges.')
row_index += 1
delay()
# Searching for the next pagination button
# Of course this page has its own mechanism for this
next_page_button = test_document.find_class('previous-next')
found_next_button = False
for element in next_page_button:
if len(element) > 0 and (element[0].get('title') == 'Next' or element[0].get('title') == 'Neste'):
next_page_button = element
found_next_button = True
break
if not found_next_button:
print('\tNo more pages found. All items have been downloaded.')
break
next_page_url = html.unescape(next_page_button[0].get('href'))[2:]
print('\tPage finished, moving on to next page.')
test_response = session.get(its.root_url[institution] + next_page_url, allow_redirects = True)
test_document = fromstring(test_response.text)
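# processNote: dumps a note element; saves the note body as HTML and downloads any
# images embedded in it.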
def processNote(institution, pathThusFar, noteURL, session):
note_response = session.get(noteURL, allow_redirects=True)
note_document = fromstring(note_response.text)
note_title_node = note_document.find_class('ccl-pageheader')[0]
note_title = sanitiseFilename(note_title_node[0].text_content())
print("\tDownloaded note:", note_title.encode('ascii', 'ignore'))
dumpDirectory = pathThusFar + '/Note - ' + note_title
dumpDirectory = sanitisePath(dumpDirectory)
dumpDirectory = makeDirectories(dumpDirectory)
note_content_div = note_document.find_class('h-userinput')[0]
for image_tag in note_content_div.iterfind(".//img"):
image_URL = image_tag.get('src')
download_file(institution, image_URL, dumpDirectory, session)
bytesToTextFile(etree.tostring(note_content_div), dumpDirectory + '/' + note_title + output_text_extension)
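# processWeblink: extracts the target URL from the link's header frame and writes it,
# together with the link title, to a small text file.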
def processWeblink(institution, pathThusFar, weblinkPageURL, link_title, session):
print('\tDownloading weblink: ', link_title.encode('ascii', 'ignore'))
weblink_response = session.get(weblinkPageURL, allow_redirects=True)
weblink_document = fromstring(weblink_response.text)
header_frame = weblink_document.find(".//frame")
header_src = header_frame.get('src')
weblink_header_response = session.get(its.weblink_header_base_url.format(its.root_url[institution]) + header_src.split('=')[1], allow_redirects=True)
weblink_header_document = fromstring(weblink_header_response.text)
link_info_node = weblink_header_document.find_class('frameheaderinfo')[0]
try:
weblink_url = etree.tostring(link_info_node[0][1], encoding='utf-8')
except IndexError:
# Some older versions have some comment/section/other cruft. It's hard to tell how to get the info out consistently, so let's try one way and hope for the best.
weblink_url = etree.tostring(link_info_node.find_class('standardfontsize')[0][1], encoding='utf-8')
link_title = sanitiseFilename(link_title)
link_file_content = (link_title + '\n\n').encode('utf-8') + weblink_url
bytesToTextFile(link_file_content, pathThusFar + '/Link - ' + link_title + output_text_extension)
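# processLearningToolElement: dumps an embedded learning tool. If the tool exposes a
# single download link, that file is fetched directly; otherwise the iframe contents
# are saved and every ccl-iconlink file link on the page is downloaded.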
def processLearningToolElement(institution, pathThusFar, elementURL, session):
element_response = session.get(elementURL, allow_redirects=True)
element_document: HtmlElement = fromstring(element_response.text)
element_title = element_document.get_element_by_id('ctl00_PageHeader_TT').text
element_title = sanitiseFilename(element_title)
try:
frameSrc = element_document.get_element_by_id('ctl00_ContentPlaceHolder_ExtensionIframe').get('src')
frame_content_response = session.get(frameSrc, allow_redirects=True)
frame_document: HtmlElement = fromstring(frame_content_response.content.decode())
try: # check if this learning tool has a single download link
download_link = its.resource_url + frame_document.get_element_by_id('ctl00_ctl00_MainFormContent_DownloadLinkForViewType').get('href')
download_file(institution, download_link, pathThusFar, session, filename=element_title)
except Exception: # it isn't a single file
print('\tDownloaded Learning Tool Element: ', element_title)
dumpDirectory = pathThusFar + '/Learning Tool Element - ' + element_title
dumpDirectory = sanitisePath(dumpDirectory)
dumpDirectory = makeDirectories(dumpDirectory)
bytesToTextFile(frame_content_response.content, dumpDirectory + '/page_contents' + output_text_extension)
frame_content_document = fromstring(frame_content_response.text)
for file_link in frame_content_document.find_class('ccl-iconlink'):
link_href = file_link[0].get('href')
link_filename = file_link[0].get('download')
if link_filename is None: continue
download_file(institution, link_href, dumpDirectory, session, filename=link_filename)
except KeyError:
print('\tPage appears to have an abnormal page structure. Falling back on dumping the entire page as-is.')
# dumpDirectory must be created here: the try block above may have failed before defining it.
dumpDirectory = makeDirectories(sanitisePath(pathThusFar + '/Learning Tool Element - ' + element_title))
bytesToTextFile(etree.tostring(element_document, pretty_print=True, encoding='utf-8'), dumpDirectory + '/page_contents' + output_text_extension)
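# processPicture: dumps a picture element; downloads the image itself and saves its
# caption text alongside it.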
def processPicture(institution, pathThusFar, pictureURL, session):
picture_response = session.get(pictureURL, allow_redirects=True)
picture_document = fromstring(picture_response.text)
element_title = picture_document.find_class('ccl-pageheader')[0].text_content()
element_title = sanitiseFilename(element_title)
print('\tDownloaded Picture:', element_title.encode('ascii', 'ignore'))
dumpDirectoryPath = pathThusFar + '/Picture - ' + element_title
dumpDirectoryPath = makeDirectories(dumpDirectoryPath)
image_base_element = picture_document.find_class('itsl-formbox')[0]
imageURL = its.root_url[institution] + image_base_element[0][0].get('src')
download_file(institution, imageURL, dumpDirectoryPath, session)
description_text = etree.tostring(image_base_element[2], encoding='utf-8')
bytesToTextFile(description_text, dumpDirectoryPath + "/caption" + output_text_extension)
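# processDiscussionPost: dumps a single forum thread. Posts come in groups of three
# table rows (header, contents, footer); deleted posts lack the footer row. Post text,
# author, and timestamp are written to one file, and embedded images are downloaded
# into an 'Attachments' folder.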
def processDiscussionPost(institution, pathThusFar, postURL, postTitle, session):
print("\tDownloading thread:", postTitle.encode('ascii', 'ignore'))
post_response = session.get(postURL, allow_redirects=True)
post_document = fromstring(post_response.text)
post_table_tag = post_document.find_class('threadViewTable')[0]
post_table_root = post_table_tag
if post_table_root[0].tag == 'tbody':
post_table_root = post_table_root[0]
postDumpDirectory = pathThusFar + '/Thread - ' + sanitiseFilename(postTitle)
completeDumpFile = postDumpDirectory
duplicateCount = 1
while os.path.exists(completeDumpFile):
if duplicateCount > 1:
completeDumpFile = postDumpDirectory + ' (Duplicate '+str(duplicateCount)+')'
else:
completeDumpFile = postDumpDirectory + ' (Duplicate)'
duplicateCount += 1
completeDumpFile = sanitisePath(completeDumpFile)
fileContents = ''
tags_to_next_entry = 0
for index, post_tag in enumerate(post_table_root):
if tags_to_next_entry != 0:
# Each post is 3 tags
tags_to_next_entry -= 1
continue
tags_to_next_entry = 2
fileContents += '-------------------------------------------------------------------------\n'
post_contents_tag = post_tag.getnext()
is_post_deleted = 'deleted' in post_contents_tag[0][0].get('class')
if is_post_deleted:
# Deleted posts do not have a third footer entry like regular posts do, so we move on to the next post one tag earlier.
tags_to_next_entry -= 1
if not is_post_deleted:
footer_tag = post_contents_tag.getnext()
try:
author = 'Author: ' + post_tag[0][2][0].text
except IndexError:
# Fallback option, probably due to an anonymous post
author = 'Author: ' + post_tag[0][0].get('alt')
post_content = convert_html_content(etree.tostring(post_contents_tag[0][0]).decode('utf-8'))
# Also download any images shown in the post
for image_tag in post_contents_tag[0][0].iterfind(".//img"):
imageDumpDirectory = pathThusFar + '/Attachments'
if not os.path.exists(imageDumpDirectory):
imageDumpDirectory = makeDirectories(imageDumpDirectory)
image_URL = image_tag.get('src')
# Some img tags have no src attribute at all. Skip those.
if image_URL is None:
continue
# Special case for relative URLs: prepend the It's Learning root URL
if not image_URL.startswith('http'):
image_URL = its.root_url[institution] + image_URL
download_file(institution, image_URL, imageDumpDirectory, session)
delay()
if not is_post_deleted:
timestamp = footer_tag[0][0][0].text.strip()
else:
timestamp = ''
fileContents += author + '\n' + timestamp + '\n\n' + post_content + '\n\n'
bytesToTextFile(fileContents.encode('utf-8'), completeDumpFile + output_text_extension)
# Add a time delay before moving on to the next post
delay()
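# processDiscussionForum: dumps a discussion forum by walking its thread table
# ('Threads_1', 'Threads_2', ...) page by page and handing each thread to
# processDiscussionPost. A crash in a single thread can be skipped interactively.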
def processDiscussionForum(institution, pathThusFar, discussionURL, session):
discussion_response = session.get(discussionURL, allow_redirects=True)
discussion_document = fromstring(discussion_response.text)
# They are sooo inconsistent with these conventions.
discussion_title = sanitiseFilename(discussion_document.get_element_by_id('ctl05_TT').text)
print("\tDownloaded discussion:", discussion_title.encode('ascii', 'ignore'))
discussionDumpDirectory = pathThusFar + '/Discussion - ' + discussion_title
discussionDumpDirectory = sanitisePath(discussionDumpDirectory)
discussionDumpDirectory = makeDirectories(discussionDumpDirectory)
# hacky way of retrieving the discussion ID, which we need for fetching the threads.
discussionID = discussionURL.split('=')[1]
# ThreadID starts counting at 1 because ID 0 is the table header.
threadID = 1
pages_remaining = True
# Pagination
while pages_remaining:
nextThreadElement = discussion_document.get_element_by_id('Threads_' + str(threadID))
if nextThreadElement[0].text is None or (not nextThreadElement[0].text.startswith('No threads') and not nextThreadElement[0].text.startswith('Ingen hovedinnlegg')):
while nextThreadElement is not None and nextThreadElement != False:
postURL = nextThreadElement[1][0].get('href')
postTitle = nextThreadElement[1][0].text
try:
processDiscussionPost(institution, discussionDumpDirectory, its.root_url[institution] + postURL, postTitle, session)
except Exception:
print('\n\nSTART OF ERROR INFORMATION\n\n\n\n')
traceback.print_exc()
print('\n\n\n\nEND OF ERROR INFORMATION')
print()
print('Oh no! The script crashed while trying to download the following discussion post:')
print((its.root_url[institution] + postURL).encode('ascii', 'ignore'))
print('Some information regarding the error is shown above.')
print('Please mail a screenshot of this information to bart.van.blokland@ntnu.no, and I can see if I can help you fix it.')
print('Would you like to skip this item and move on?')
print('Type \'skip\' if you\'d like to skip this element and continue downloading any remaining elements, or anything else if you\'d like to abort the download.')
decision = input('Skip this element? ')
if decision != 'skip':
print('Download has been aborted.')
sys.exit(0)
threadID += 1
try:
nextThreadElement = discussion_document.get_element_by_id('Threads_' + str(threadID))
except KeyError:
nextThreadElement = False
else:
bytesToTextFile('No threads were created in this forum.'.encode('utf-8'), discussionDumpDirectory + '/No threads.txt')
# Move on to next page
found_next_page, discussion_response = loadPaginationPage(discussionURL, discussion_document, backpatch_character_index=7)
if found_next_page:
discussion_document = fromstring(discussion_response.text)
# Start at the first thread on the next page
threadID = 1
else:
pages_remaining = False
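# processAssignment: dumps an assignment. Always saves the task description and its
# attached files; for students it downloads their own submission and assessment, and
# for teachers it clears any result filters and downloads every student submission,
# including grades and feedback.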
def processAssignment(institution, pathThusFar, assignmentURL, session):
print("\tDownloading assignment:", assignmentURL.encode('ascii', 'ignore'))
assignment_response = session.get(assignmentURL, allow_redirects=True)
if its.unauthorized_url[institution] in assignment_response.url:
print('\t Access denied. Skipping.')
return
assignment_document = fromstring(assignment_response.text)
#writeHTML(assignment_document, 'output.html')
assignment_title = assignment_document.get_element_by_id('ctl05_TT').text
dumpDirectory = pathThusFar + '/Assignment - ' + assignment_title
dumpDirectory = sanitisePath(dumpDirectory)
dumpDirectory = makeDirectories(dumpDirectory)
assignment_answer_table = assignment_document.find_class('itsl-assignment-answer')
# Download the assignment description
details_sidebar_element = assignment_document.find_class('ccl-rwgm-column-1-3')[0]
description_element = assignment_document.find_class('ccl-rwgm-column-2-3')[0]
assignment_description = convert_html_content(etree.tostring(description_element[1], encoding='utf-8').decode('utf-8'))
details_element = details_sidebar_element[1]
assignment_details = ''
for element in details_element:
# Just dump the table on the right sidebar as-is
assignment_details += ' '.join(convert_html_content(element.text_content()).split('\n')).strip() + '\n'
assignment_details += '\nTask description:\n\n' + assignment_description
bytesToTextFile(assignment_details.encode('utf-8'), dumpDirectory + '/Assignment description' + output_text_extension)
# Download assignment description files
file_listing_element = description_element[2][1]
for file_element in file_listing_element:
file_url = file_element[0].get('href')
download_file(institution, file_url, dumpDirectory, session)
# Download own submission, but only if assignment was answered
if assignment_answer_table:
answerDumpDirectory = dumpDirectory + '/Own answer'
answerDumpDirectory = makeDirectories(answerDumpDirectory)
# For some reason not all answers have a tbody tag.
assignment_answer_root = assignment_answer_table[0]
if assignment_answer_root[0].tag == 'tbody':
assignment_answer_root = assignment_answer_root[0]
assessment_file_contents = ''.encode('utf-8')
# Part 1: The table describing when you submitted, who evaluated you, etc
baseInformationTable = assignment_answer_table[0].getprevious()
while not baseInformationTable.tag == 'table':
baseInformationTable = baseInformationTable.getprevious()
if baseInformationTable[0].tag == 'tbody':
baseInformationTable = baseInformationTable[0]
for entry in baseInformationTable:
if entry is None or entry[0] is None or entry[0].text is None:
continue
assessment_file_contents += (entry[0].text + ': ').encode('utf-8') + etree.tostring(entry[1], encoding='utf-8') + '\n'.encode('utf-8')
# Part 2: The table containing your submitted files and feedback
for entry in assignment_answer_root:
if entry is None or entry[0] is None or entry[0].text is None:
continue
if entry[0].text.startswith('Files') or entry[0].text.startswith('Filer'):
file_list_div = entry[1][0]
for index, file_entry in enumerate(file_list_div):
if file_entry.tag == 'section':
continue
if len(file_entry) == 0:
continue
file_index = None
if len(file_list_div) > 2:
file_index = index
file_location = file_entry[0][0].get('href')
download_file(institution, file_location, answerDumpDirectory, session, file_index)
else:
for attached_file in entry.find_class('ccl-iconlink'):
file_location = attached_file.get('href')
download_file(institution, file_location, answerDumpDirectory, session)
assessment_file_contents += (entry[0].text + ': ').encode('utf-8') + etree.tostring(entry[1], encoding='utf-8') + '\n'.encode('utf-8')
bytesToTextFile(assessment_file_contents, answerDumpDirectory + '/assessment.html')
filter_box_present = True
try:
assignment_document.get_element_by_id('EssayAnswers_ctl00_groupFilter_filter')
except KeyError:
filter_box_present = False
# First, we'll try to disable all filters, to ensure everything is downloaded.
if filter_box_present:
filter_elements = assignment_document.get_element_by_id('EssayAnswers_ctl00_groupFilter_filter')
# Check all filter checkboxes
postback_form = None
for form in assignment_document.forms:
if '__EVENTTARGET' in form.fields:
postback_form = form
break
# There should be a postback form on every page though
if postback_form is not None:
for form_input_name in postback_form.fields:
if form_input_name.startswith('EssayAnswers$ctl00$groupFilter'):
postback_form.inputs[form_input_name].checked = True
# And do a postback to get a page with no filters applied
postback_response = doPostBack(assignmentURL, 'EssayAnswers$ctl00$groupFilter', assignment_document, postback_parameter='filter')
assignment_document = fromstring(postback_response.text)
answers_submitted = True
try:
assignment_document.get_element_by_id('EssayAnswers_0')
# Having 2 table entries is guaranteed if there are any answers present.
# However, if any filters have been applied we need to ensure the row actually contains an answer.
if len(assignment_document.get_element_by_id('EssayAnswers_1')) <= 1:
answers_submitted = False
except Exception:
answers_submitted = False
print('\tNo answers detected.')
if answers_submitted:
student_submissions = dumpDirectory + '/Student answers'
student_submissions = makeDirectories(student_submissions)
# Index 0 is the table header, which we skip
pages_remaining = True
while pages_remaining:
submission_index = 1
answers_remaining = True
while answers_remaining:
try:
submission_element = assignment_document.get_element_by_id('EssayAnswers_{}'.format(submission_index))
except KeyError:
# End the loop when there are no more submissions
answers_remaining = False
continue
#for i in range(0, 10):
# try:
# print(i, ':', etree.tostring(submission_element[i]))
# except IndexError:
# pass
no_group_index_offset = 0
if 'No group' in submission_element[2].text_content() or 'Ingen gruppe' in submission_element[2].text_content():
no_group_index_offset = 1
elif 'Manage' in submission_element[2].text_content() or 'Administrer' in submission_element[2].text_content():
no_group_index_offset = 1
elif 'New group' in submission_element[2].text_content() or 'Ny gruppe' in submission_element[2].text_content():
no_group_index_offset = 1
#print("Index offset:", no_group_index_offset)
# Exploits the fact that solution links have no text with coloured highlighting
try:
plagiarism_text_element = submission_element[6 + no_group_index_offset][0]
has_plagiarism_report = plagiarism_text_element.get('class') is not None and ('colorbox' in plagiarism_text_element.get('class') or 'h-hidden' in plagiarism_text_element.get('class'))
except IndexError:
has_plagiarism_report = False
plagiarism_index_offset = 0
if has_plagiarism_report:
plagiarism_index_offset = 1
score_index_offset = 1
#print('plagiarism offset:', plagiarism_index_offset)
# Column 0: Checkbox
# Column 1: Student names
try:
students = [link[0].text for link in submission_element[1].find_class('ccl-iconlink')]
except Exception:
students = [submission_element[1].text_content()]
if not students:
students = [submission_element[1].text_content()]
# Column 2: Submission date/time
submission_time = submission_element[2 + no_group_index_offset].text
# Column 3: Review date
review_date = submission_element[3 + no_group_index_offset].text
# Column 4: Status
status = submission_element[4 + no_group_index_offset].text_content()
# Column 5: Score
# If nobody answered the assignment, none of the following elements are present, so indexing will fail
try:
if not ('Show' in submission_element[5 + no_group_index_offset].text_content() or 'Vis' in submission_element[5 + no_group_index_offset].text_content()):
score = submission_element[5 + no_group_index_offset].text
else:
# We have hit the assignment details link. This requires adjusting the offset
score_index_offset = 0
score = None
except IndexError:
score = None
# Column 6: Plagiarism status
if has_plagiarism_report:
try:
plagiarism_status = submission_element[6 + no_group_index_offset].text_content()
except IndexError:
plagiarism_status = None
else:
plagiarism_status = None
# Column 7: Show (link to details page)
try:
# Exploit that the last entry is always the details link
details_page_url = its.root_url[institution] + submission_element[len(submission_element) - 1][0].get('href')
except IndexError:
details_page_url = None
has_submitted = submission_time is not None and not 'Not submitted' in submission_time and not 'Ikke levert' in submission_time
if submission_time is None:
submission_time = 'Not submitted.'
if review_date is None:
review_date = 'Not assessed.'
if score is None:
score = ''
if plagiarism_status is None:
plagiarism_status = 'No plagiarism check has been done.'
print('\tDownloading assignment submission ', students[0].encode('ascii', 'ignore'))
comment_field_contents = ''
details_page_content = None
# Only download solution if one was submitted
if has_submitted:
details_page_response = session.get(details_page_url, allow_redirects = True)
details_page_content = fromstring(details_page_response.content)
assessment_form_element = details_page_content.get_element_by_id('AssessForm')
comment_field_element = assessment_form_element.get_element_by_id('AssessForm_comments_EditorCKEditor_ctl00')
comment_field_contents = convert_html_content(etree.tostring(comment_field_element).decode('utf-8'))
answer_directory = student_submissions + '/' + sanitiseFilename(students[0])
answer_directory = makeDirectories(answer_directory)
# Write out assessment details to a file