-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHTMLReader.java
More file actions
106 lines (91 loc) · 3.93 KB
/
HTMLReader.java
File metadata and controls
106 lines (91 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import java.util.Scanner;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.BufferedWriter;
import java.io.IOException;
public class HTMLReader {
public static void main(String[] args) {
File path = new File("/Users/noellependergraft/Documents/Research/AAA/html");
File[] files = path.listFiles();
for (File file:files){
try {
FileWriter output = new FileWriter("AAA.csv");
Scanner scan = new Scanner(file).useDelimiter("\\s+");
String line = scan.nextLine();
//this string & its loop are used to skip through all the junk at the beginning until the scanner reaches the review section
String reviewTime = ",\"review\": [";
//the total number of reviews that exist in/should be parsed from the files
int reviewNumber = 198;
while (!line.contains(reviewTime)) {
if (scan.hasNextLine()) {
line = scan.nextLine();
} else {
break;
}
}
while (scan.hasNextLine()) {
line = scan.nextLine();
String type = "\"@type\": \"";
if (line.contains(type)) {
//type is an element in the html, will generally be "Review" but could potentially be "Response"
//html line looks like "@type": "Review",
//t stands for type in the scanner name
Scanner t = new Scanner(line).useDelimiter("\"@type\": \"|\",\\s*");
t.next();
output.write(t.next() + "\t");
t.close();
String line2 = scan.nextLine();
//this should extract the year, month, and date from HTML format "datePublished": "2017-04-13 21:56:51+00:00",
//d stands for date
Scanner d = new Scanner(line2).useDelimiter("\"datePublished\": \"|\",|-");
d.next();
output.write(d.next() + "\t");
output.append(d.next() + "\t");
output.append(d.next().substring(0,2) + "\t");
d.close();
String line3 = scan.nextLine();
//this extracts the review body
//rb stands for review body
Scanner rb = new Scanner(line3).useDelimiter("\"reviewBody\": \"|\",");
rb.next();
output.write(rb.next() + "\t");
rb.close();
//this skips line "author": {
//author containes an @type and a name
scan.nextLine();
String line4 = scan.nextLine();
//at stands for author type: generally extracts Person
Scanner at = new Scanner(line4).useDelimiter("\"@type\": \"|\",");
at.next();
output.write(at.next() + "\t");
at.close();
String line5 = scan.nextLine();
//name is an html element formatted as Cristin of Vallejo, CA
//called author because it contains both author name and author location information
Scanner author = new Scanner(line5).useDelimiter("\"name\":\"|\"| of |, ");
author.next();
output.write(author.next() + "\t");
output.write(author.next() + "\t");
output.write(author.next() + "\t");
author.close();
//skips unnecessary information
scan.nextLine();
scan.nextLine();
scan.nextLine();
scan.nextLine();
String line6 = scan.nextLine();
//extracts the 1-star to 5-star rating given to the company by the review author
Scanner rating = new Scanner(line6).useDelimiter("\"ratingValue\": \"|\",");
rating.next();
output.write(rating.next() + "\n");
rating.close();
}
}
output.close();
} catch (IOException e) {
System.out.println("File not found exception.");
}
}
}
}