Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Manifest.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar
Empty file added __init__.py
Empty file.
13 changes: 13 additions & 0 deletions dwtc-extension/.classpath
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@
<attribute name="m2e-apt" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="optional" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
<attribute name="optional" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
<attributes>
<attribute name="optional" value="true"/>
Expand Down
4 changes: 2 additions & 2 deletions dwtc-extension/.project
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
</natures>
<filteredResources>
<filter>
<id>1605118619462</id>
<id>1705582647115</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.core.resources.regexFilterMatcher</id>
<arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
<arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
</matcher>
</filter>
</filteredResources>
Expand Down
2 changes: 1 addition & 1 deletion dwtc-extension/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>

<dependencies>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
package webreduce.extension.classification;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;

import com.google.common.base.Optional;
import com.google.inject.name.Named;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import webreduce.data.TableType;
import webreduce.extraction.mh.TableClassification;
import webreduce.extraction.mh.features.FeaturesP1;
import webreduce.extraction.mh.tools.ClassificationResult;
import webreduce.extraction.mh.tools.TableConvert;
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.Instance;
import webreduce.extension.classification.TableParser.TableParsingException;
import webreduce.extension.classification.TableParser.TableParsingSubTablesException;

public class TableClassificationPhase1 {

private TableConvert tableConverter;
private FeaturesP1 phase1Features;
private Classifier classifier1;
private Attribute classAttr1;
private double layoutVal, relationVal;
private static final int TABLE_MIN_ROWS = 2;
private static final int TABLE_MIN_COLS = 2;
private TableParser tableParser;

public TableClassificationPhase1(@Named("phase1ModelPath") String phase1ModelPath) {
phase1Features = new FeaturesP1();
tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS);
tableParser = new TableParser();

try {
classifier1 = TableClassification.loadModelFromFile(phase1ModelPath);
Expand Down Expand Up @@ -68,10 +62,10 @@ public ClassificationResult classifyTable(Element[][] convertedTable) {
public ClassificationResult classifyTable(String tableHTML) {
Element[][] table;
try {
table = parseTableHTML(tableHTML);
table = tableParser.parseTableHTML(tableHTML);
} catch (TableParsingSubTablesException e) {
System.out.println(e.getMessage());
return new ClassificationResult(TableType.LAYOUT, new double[]{}, null);
return new ClassificationResult(TableType.LAYOUT, new double[] {}, null);
} catch (TableParsingException e) {
System.out.println(e.getMessage());
return null;
Expand All @@ -83,9 +77,9 @@ public double[] computeFeatures(String tableHTML) {
Element[][] table;

try {
table = parseTableHTML(tableHTML);
table = tableParser.parseTableHTML(tableHTML);
} catch (TableParsingException e) {
System.out.println(e.getMessage());
System.out.println(e.getMessage());
return null;
}

Expand All @@ -94,45 +88,27 @@ public double[] computeFeatures(String tableHTML) {
return currentInst.toDoubleArray();
}

private Element[][] parseTableHTML(String tableHTML) throws TableParsingException {
return parseTableHTML(tableHTML, true);
}

private Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException {
Document doc = Jsoup.parse(tableHTML);
Element table = doc.select("table").first();
if (table == null) {
throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification.");
}

Elements subTables = table.getElementsByTag("table");
subTables.remove(table);
if (subTables.size() > 0 && skipSubTables) {
throw new TableParsingSubTablesException(
"Failure, table includes sub-table(s). Skipping table classification.");
}

Optional<Element[][]> convertedTable = tableConverter.toTable(table);
if (!convertedTable.isPresent()) {
throw new TableParsingException("toTable() failed. Skipping table classification.");
}

return convertedTable.get();
}

public class TableParsingException extends Exception {
private static final long serialVersionUID = 5471172109211007529L;

public TableParsingException(String errorMessage) {
super(errorMessage);
}
}

public class TableParsingSubTablesException extends TableParsingException {
private static final long serialVersionUID = -4415254026083906516L;

public TableParsingSubTablesException(String errorMessage) {
super(errorMessage);
}
/**
 * Manual smoke test: reads an HTML file, runs the phase-1 classifier on it and prints the
 * outcome to standard output.
 *
 * @param args optional overrides; {@code args[0]} is the path of the HTML file to classify and
 *             {@code args[1]} the path of the phase-1 model file. When an argument is missing
 *             the original developer-machine default is used.
 * @throws IOException if the HTML file cannot be opened or closed
 */
public static void main(String[] args) throws IOException {
    String htmlPath = args.length > 0
            ? args[0]
            : "/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html";
    String modelPath = args.length > 1
            ? args[1]
            : "/Users/yuvalpeleg/projects/web-table-classification/runtime_testing/resources/RandomForest_P1.mdl";

    // try-with-resources closes the reader on every path; failures to open/close still propagate.
    try (BufferedReader br = new BufferedReader(new FileReader(htmlPath))) {
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(System.lineSeparator());
            }

            TableClassificationPhase1 classificationPhase1 = new TableClassificationPhase1(modelPath);
            // Explicit type instead of `var` so the code does not require Java 10+.
            ClassificationResult res = classificationPhase1.classifyTable(sb.toString());
            if (res == null) {
                // classifyTable(String) returns null when the HTML cannot be parsed into a table.
                System.out.println("No classification result (classifyTable returned null).");
            } else {
                System.out.println(res.toString());
            }
        } catch (Exception e) {
            // Best-effort debugging entry point: report the failure instead of crashing.
            System.out.println(e.toString());
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package webreduce.extension.classification;

import org.jsoup.nodes.Element;

import webreduce.data.TableType;
import webreduce.extraction.mh.tools.ClassificationResult;
import webreduce.extraction.mh.TableClassification;
import webreduce.extension.classification.TableParser.TableParsingException;
import webreduce.extension.classification.TableParser.TableParsingSubTablesException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Convenience facade that parses a table's HTML with {@link TableParser} and hands the parsed
 * cell grid to {@link TableClassification}, which is constructed with the two bundled model
 * resources {@code /RandomForest_P1.mdl} and {@code /RandomForest_P2.mdl}.
 */
public class TableClassifier {
    private final TableParser tableParser;
    private final TableClassification tableClassification;

    /** Creates a classifier backed by a fresh parser and the two bundled model resources. */
    public TableClassifier() {
        tableParser = new TableParser();
        tableClassification = new TableClassification(
                "/RandomForest_P1.mdl",
                "/RandomForest_P2.mdl");
    }

    /**
     * Classifies the first table found in the given HTML.
     *
     * @param tableHTML HTML markup expected to contain a {@code <table>} element
     * @return the classification result; a {@link TableType#LAYOUT} result with an empty
     *         distribution when the table contains sub-tables; {@code null} when the HTML
     *         cannot be parsed into a table at all
     */
    public ClassificationResult classifyTable(String tableHTML) {
        Element[][] table;
        try {
            table = tableParser.parseTableHTML(tableHTML);
        } catch (TableParsingSubTablesException e) {
            // Must be caught before its superclass TableParsingException.
            System.out.println(e.getMessage());
            return new ClassificationResult(TableType.LAYOUT, new double[] {}, null);
        } catch (TableParsingException e) {
            System.out.println(e.getMessage());
            return null;
        }
        return tableClassification.classifyTable(table);
    }

    /**
     * Manual smoke test: reads an HTML file, classifies it and prints the outcome.
     *
     * @param args optional override; {@code args[0]} is the path of the HTML file to classify.
     *             When missing, the original developer-machine default is used.
     * @throws IOException if the HTML file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        String htmlPath = args.length > 0
                ? args[0]
                : "/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html";

        // try-with-resources closes the reader on every path; failures to open/close propagate.
        try (BufferedReader br = new BufferedReader(new FileReader(htmlPath))) {
            try {
                // NOTE(review): the return value is discarded; presumably this is only a check
                // that the model can be loaded from the classpath - confirm or remove.
                TableClassification.loadModelFromClasspath("/RandomForest_P1.mdl");

                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line).append(System.lineSeparator());
                }

                TableClassifier tableClassifier = new TableClassifier();
                // Explicit type instead of `var` so the code does not require Java 10+.
                ClassificationResult res = tableClassifier.classifyTable(sb.toString());
                if (res == null) {
                    // classifyTable returns null when the HTML cannot be parsed into a table.
                    System.out.println("No classification result (classifyTable returned null).");
                } else {
                    System.out.println(res.toString());
                }
            } catch (Exception e) {
                // Best-effort debugging entry point: report the failure instead of crashing.
                System.out.println(e.toString());
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package webreduce.extension.classification;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.common.base.Optional;

import webreduce.extraction.mh.tools.TableConvert;

/**
 * Turns a snippet of HTML into the {@code Element[][]} cell grid produced by
 * {@link TableConvert#toTable}, reporting every failure through a checked
 * {@link TableParsingException}.
 */
public class TableParser {
    private static final int TABLE_MIN_ROWS = 2;
    private static final int TABLE_MIN_COLS = 2;

    private final TableConvert tableConverter;

    /** Creates a parser whose converter is configured with the row/column constants above. */
    public TableParser() {
        tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS);
    }

    /**
     * Parses the first table in the HTML, rejecting tables that contain nested tables.
     *
     * @param tableHTML HTML markup expected to contain a {@code <table>} element
     * @return the converted cell grid
     * @throws TableParsingSubTablesException if the table contains nested tables
     * @throws TableParsingException if no table is found or the conversion fails
     */
    public Element[][] parseTableHTML(String tableHTML) throws TableParsingException {
        return parseTableHTML(tableHTML, true);
    }

    /**
     * Parses the first table in the HTML.
     *
     * @param tableHTML     HTML markup expected to contain a {@code <table>} element
     * @param skipSubTables when {@code true}, a table containing nested tables is rejected
     * @return the converted cell grid
     * @throws TableParsingSubTablesException if {@code skipSubTables} is set and the table
     *                                        contains nested tables
     * @throws TableParsingException if no table is found or the conversion fails
     */
    public Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException {
        Document doc = Jsoup.parse(tableHTML);
        Element table = doc.select("table").first();
        if (table == null) {
            throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification.");
        }

        // Collect every <table> under the outer table, then drop the outer table itself so
        // that only nested tables remain in the list.
        Elements subTables = table.getElementsByTag("table");
        subTables.remove(table);
        if (skipSubTables && !subTables.isEmpty()) {
            throw new TableParsingSubTablesException(
                    "Failure, table includes sub-table(s). Skipping table classification.");
        }

        Optional<Element[][]> convertedTable = tableConverter.toTable(table);
        if (!convertedTable.isPresent()) {
            throw new TableParsingException("toTable() failed. Skipping table classification.");
        }

        return convertedTable.get();
    }

    /**
     * Signals that the HTML could not be turned into a table.
     *
     * <p>Declared {@code static} so that the exception does not carry a hidden reference to
     * the enclosing (non-serializable) {@link TableParser} instance.
     */
    public static class TableParsingException extends Exception {
        private static final long serialVersionUID = 5471172109211007529L;

        public TableParsingException(String errorMessage) {
            super(errorMessage);
        }
    }

    /** Signals that the table was rejected because it contains nested tables. */
    public static class TableParsingSubTablesException extends TableParsingException {
        private static final long serialVersionUID = -4415254026083906516L;

        public TableParsingSubTablesException(String errorMessage) {
            super(errorMessage);
        }
    }
}
Empty file added runtime_testing/__init__.py
Empty file.
Empty file.
Binary file not shown.
67 changes: 67 additions & 0 deletions runtime_testing/resources/table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<table border="0" width="100%" cellspacing="0" cellpadding="0">
<tbody>
<tr bgcolor="#EDF1F3">
<th align="center"><span style="text-decoration: underline;">Catchers</span></th>
<th align="center"><span style="text-decoration: underline;">Infielders</span></th>
<th align="center"><span style="text-decoration: underline;">Outfielders</span></th>
<th align="center"><span style="text-decoration: underline;">Rotation</span></th>
<th align="center"><span style="text-decoration: underline;">Bullpen</span></th>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center">Gary Sanchez</td>
<td align="center">1B Greg Bird</td>
<td align="center">LF Giancarlo Stanton</td>
<td align="center">Masahiro Tanaka</td>
<td align="center">CL Aroldis Chapman</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center">Austin Romine</td>
<td align="center">1B Luke Voit</td>
<td align="center">CF Brett Gardner</td>
<td align="center">James Paxton</td>
<td align="center">SU Zack Britton</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center">2B Gleyber Torres</td>
<td align="center">RF Aaron Judge</td>
<td align="center">J.A. Happ</td>
<td align="center">SU Chad Green</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center">SS Troy Tulowitzki</td>
<td align="center">UTIL Tyler Wade</td>
<td align="center">Luis Cessa</td>
<td align="center">SU Adam Ottavino</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center">3B Miguel Andujar</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">MR Jonathan Holder</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center">IF DJ LeMahieu</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">MR Tommy Kahnle</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center"></td>
<td align="center"></td>
<td align="center"></td>
<td align="center">MR Stephen Tarpley</td>
</tr>
<tr bgcolor="#FFFFFF">
<td align="center"></td>
<td align="center"></td>
<td align="center"></td>
<td align="center"></td>
<td align="center">SWG Domingo German</td>
</tr>
</tbody>
</table>
17 changes: 17 additions & 0 deletions runtime_testing/utils/table_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import jnius_config
import os

# Directory that holds the bundled jar and the sample table, resolved relative to this file.
_RESOURCES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'resources')

jar_file_path = os.path.join(_RESOURCES_DIR, 'dwtc-extension-1.0-jar-with-dependencies.jar')
# The classpath has to be configured before `jnius` itself is imported below.
jnius_config.set_classpath(jar_file_path)

from jnius import autoclass
TableClassifier = autoclass('webreduce.extension.classification.TableClassifier')
table_classifier = TableClassifier()


def classify_table_2_phase(table_html):
    """Classify a table with the Java ``TableClassifier``.

    Args:
        table_html: HTML markup of the table, passed unchanged to
            ``TableClassifier.classifyTable``.

    Returns:
        The Java ``ClassificationResult`` proxy, or ``None`` when the Java side
        returns ``null`` (it does so when the HTML cannot be parsed into a table).
    """
    return table_classifier.classifyTable(table_html)


if __name__ == '__main__':
    # Explicit encoding so the sample is read the same way on every platform.
    with open(os.path.join(_RESOURCES_DIR, 'table.html'), encoding='utf-8') as f:
        table_html = f.read()
    result = classify_table_2_phase(table_html)
    if result is None:
        print('No classification result (classifyTable returned null).')
    else:
        print(result.tableType.toString())
Loading