diff --git a/Manifest.in b/Manifest.in
new file mode 100644
index 0000000..4a5a926
--- /dev/null
+++ b/Manifest.in
@@ -0,0 +1 @@
+include runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dwtc-extension/.classpath b/dwtc-extension/.classpath
index 9ba41a2..b3a4342 100644
--- a/dwtc-extension/.classpath
+++ b/dwtc-extension/.classpath
@@ -31,6 +31,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dwtc-extension/.project b/dwtc-extension/.project
index 3238494..0b5d123 100644
--- a/dwtc-extension/.project
+++ b/dwtc-extension/.project
@@ -22,12 +22,12 @@
- 1605118619462
+ 1705582647115
30
org.eclipse.core.resources.regexFilterMatcher
- node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
+ node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
diff --git a/dwtc-extension/pom.xml b/dwtc-extension/pom.xml
index ae5cb37..8e169f0 100644
--- a/dwtc-extension/pom.xml
+++ b/dwtc-extension/pom.xml
@@ -13,7 +13,7 @@
UTF-8
11
- 11
+ 1.7
diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java
index 2f86741..93dede7 100644
--- a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java
+++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java
@@ -1,37 +1,31 @@
package webreduce.extension.classification;
-
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
import java.util.Arrays;
-
-import com.google.common.base.Optional;
import com.google.inject.name.Named;
-
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-
import webreduce.data.TableType;
import webreduce.extraction.mh.TableClassification;
import webreduce.extraction.mh.features.FeaturesP1;
import webreduce.extraction.mh.tools.ClassificationResult;
-import webreduce.extraction.mh.tools.TableConvert;
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.Instance;
+import webreduce.extension.classification.TableParser.TableParsingException;
+import webreduce.extension.classification.TableParser.TableParsingSubTablesException;
public class TableClassificationPhase1 {
- private TableConvert tableConverter;
private FeaturesP1 phase1Features;
private Classifier classifier1;
private Attribute classAttr1;
private double layoutVal, relationVal;
- private static final int TABLE_MIN_ROWS = 2;
- private static final int TABLE_MIN_COLS = 2;
+ private TableParser tableParser;
public TableClassificationPhase1(@Named("phase1ModelPath") String phase1ModelPath) {
phase1Features = new FeaturesP1();
- tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS);
+ tableParser = new TableParser();
try {
classifier1 = TableClassification.loadModelFromFile(phase1ModelPath);
@@ -68,10 +62,10 @@ public ClassificationResult classifyTable(Element[][] convertedTable) {
public ClassificationResult classifyTable(String tableHTML) {
Element[][] table;
try {
- table = parseTableHTML(tableHTML);
+ table = tableParser.parseTableHTML(tableHTML);
} catch (TableParsingSubTablesException e) {
System.out.println(e.getMessage());
- return new ClassificationResult(TableType.LAYOUT, new double[]{}, null);
+ return new ClassificationResult(TableType.LAYOUT, new double[] {}, null);
} catch (TableParsingException e) {
System.out.println(e.getMessage());
return null;
@@ -83,9 +77,9 @@ public double[] computeFeatures(String tableHTML) {
Element[][] table;
try {
- table = parseTableHTML(tableHTML);
+ table = tableParser.parseTableHTML(tableHTML);
} catch (TableParsingException e) {
- System.out.println(e.getMessage());
+ System.out.println(e.getMessage());
return null;
}
@@ -94,45 +88,27 @@ public double[] computeFeatures(String tableHTML) {
return currentInst.toDoubleArray();
}
- private Element[][] parseTableHTML(String tableHTML) throws TableParsingException {
- return parseTableHTML(tableHTML, true);
- }
-
- private Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException {
- Document doc = Jsoup.parse(tableHTML);
- Element table = doc.select("table").first();
- if (table == null) {
- throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification.");
- }
-
- Elements subTables = table.getElementsByTag("table");
- subTables.remove(table);
- if (subTables.size() > 0 && skipSubTables) {
- throw new TableParsingSubTablesException(
- "Failure, table includes sub-table(s). Skipping table classification.");
- }
-
- Optional convertedTable = tableConverter.toTable(table);
- if (!convertedTable.isPresent()) {
- throw new TableParsingException("toTable() failed. Skipping table classification.");
- }
-
- return convertedTable.get();
- }
-
- public class TableParsingException extends Exception {
- private static final long serialVersionUID = 5471172109211007529L;
-
- public TableParsingException(String errorMessage) {
- super(errorMessage);
- }
- }
-
- public class TableParsingSubTablesException extends TableParsingException {
- private static final long serialVersionUID = -4415254026083906516L;
-
- public TableParsingSubTablesException(String errorMessage) {
- super(errorMessage);
- }
+    /**
+     * Manual smoke test; optional args: [0] table HTML file, [1] phase-1 model file.
+     */
+    public static void main(String[] args) throws IOException {
+        String htmlPath = args.length > 0 ? args[0]
+                : "/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html";
+        String modelPath = args.length > 1 ? args[1]
+                : "/Users/yuvalpeleg/projects/web-table-classification/runtime_testing/resources/RandomForest_P1.mdl";
+        BufferedReader br = new BufferedReader(new FileReader(htmlPath));
+        try {
+            StringBuilder sb = new StringBuilder();
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                sb.append(line).append(System.lineSeparator());
+            }
+            ClassificationResult res = new TableClassificationPhase1(modelPath).classifyTable(sb.toString());
+            // classifyTable returns null when the HTML cannot be parsed; guard the toString() call.
+            System.out.println(res == null ? "No classification: table could not be parsed." : res.toString());
+        } catch (Exception e) {
+            System.out.println(e.toString());
+        } finally {
+            br.close();
         }
+    }
}
diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java
new file mode 100644
index 0000000..e62d36f
--- /dev/null
+++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java
@@ -0,0 +1,78 @@
+package webreduce.extension.classification;
+
+import org.jsoup.nodes.Element;
+
+import webreduce.data.TableType;
+import webreduce.extraction.mh.tools.ClassificationResult;
+import webreduce.extraction.mh.TableClassification;
+import webreduce.extension.classification.TableParser.TableParsingException;
+import webreduce.extension.classification.TableParser.TableParsingSubTablesException;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+
+/**
+ * Classifies HTML tables: parses the markup with {@link TableParser} and hands the
+ * resulting cell grid to {@link TableClassification}.
+ */
+public class TableClassifier {
+    private static final String PHASE1_MODEL = "/RandomForest_P1.mdl";
+    private static final String PHASE2_MODEL = "/RandomForest_P2.mdl";
+    /** Table file read by {@link #main} when no path argument is supplied. */
+    private static final String DEFAULT_TABLE_PATH =
+            "/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html";
+
+    private final TableParser tableParser;
+    private final TableClassification tableClassification;
+
+    public TableClassifier() {
+        tableParser = new TableParser();
+        tableClassification = new TableClassification(PHASE1_MODEL, PHASE2_MODEL);
+    }
+
+    /**
+     * Parses {@code tableHTML} and classifies the resulting table.
+     *
+     * @param tableHTML HTML markup containing the table to classify
+     * @return the classifier's result; a result of type {@code LAYOUT} when the table
+     *         contains sub-tables; {@code null} when parsing fails for any other reason
+     */
+    public ClassificationResult classifyTable(String tableHTML) {
+        Element[][] table;
+        try {
+            table = tableParser.parseTableHTML(tableHTML);
+        } catch (TableParsingSubTablesException e) {
+            System.out.println(e.getMessage());
+            return new ClassificationResult(TableType.LAYOUT, new double[] {}, null);
+        } catch (TableParsingException e) {
+            System.out.println(e.getMessage());
+            return null;
+        }
+        return tableClassification.classifyTable(table);
+    }
+
+    /**
+     * Manual smoke test: classifies the table stored in {@code args[0]} (default
+     * {@link #DEFAULT_TABLE_PATH}) and prints the result.
+     */
+    public static void main(String[] args) throws IOException {
+        String tablePath = args.length > 0 ? args[0] : DEFAULT_TABLE_PATH;
+        BufferedReader br = new BufferedReader(new FileReader(tablePath));
+        try {
+            // NOTE(review): return value is discarded, as in the original; presumably an
+            // early check that the model resource loads — confirm it is still needed.
+            TableClassification.loadModelFromClasspath(PHASE1_MODEL);
+            StringBuilder sb = new StringBuilder();
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                sb.append(line).append(System.lineSeparator());
+            }
+            ClassificationResult res = new TableClassifier().classifyTable(sb.toString());
+            // classifyTable returns null for unparseable HTML; do not call toString() on it.
+            System.out.println(res == null ? "No classification: table could not be parsed." : res.toString());
+        } catch (Exception e) {
+            System.out.println(e.toString());
+        } finally {
+            br.close();
+        }
+    }
+}
diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java
new file mode 100644
index 0000000..05b4284
--- /dev/null
+++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java
@@ -0,0 +1,81 @@
+package webreduce.extension.classification;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.google.common.base.Optional;
+
+import webreduce.extraction.mh.tools.TableConvert;
+
+/** Converts table HTML into the {@code Element[][]} grid produced by {@link TableConvert}. */
+public class TableParser {
+    private static final int TABLE_MIN_ROWS = 2;
+    private static final int TABLE_MIN_COLS = 2;
+
+    private final TableConvert tableConverter;
+
+    public TableParser() {
+        tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS);
+    }
+
+    /** Equivalent to {@code parseTableHTML(tableHTML, true)}: nested tables are rejected. */
+    public Element[][] parseTableHTML(String tableHTML) throws TableParsingException {
+        return parseTableHTML(tableHTML, true);
+    }
+
+    /**
+     * Parses the first {@code table} element found in {@code tableHTML}.
+     *
+     * @param tableHTML     HTML markup to search for a table
+     * @param skipSubTables when {@code true}, a table that contains nested tables is rejected
+     * @return the converted table
+     * @throws TableParsingSubTablesException if {@code skipSubTables} is set and the table
+     *                                        contains at least one nested table
+     * @throws TableParsingException          if no table is found or the conversion fails
+     */
+    public Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException {
+        Document doc = Jsoup.parse(tableHTML);
+        Element table = doc.select("table").first();
+        if (table == null) {
+            throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification.");
+        }
+
+        // getElementsByTag also matches the element itself, so drop it before counting.
+        Elements subTables = table.getElementsByTag("table");
+        subTables.remove(table);
+        if (skipSubTables && !subTables.isEmpty()) {
+            throw new TableParsingSubTablesException(
+                    "Failure, table includes sub-table(s). Skipping table classification.");
+        }
+
+        Optional<Element[][]> convertedTable = tableConverter.toTable(table);
+        if (!convertedTable.isPresent()) {
+            throw new TableParsingException("toTable() failed. Skipping table classification.");
+        }
+
+        return convertedTable.get();
+    }
+
+    /**
+     * Raised when table HTML cannot be turned into a cell grid. Declared {@code static} so
+     * instances carry no hidden reference to the enclosing {@link TableParser}.
+     */
+    public static class TableParsingException extends Exception {
+        private static final long serialVersionUID = 5471172109211007529L;
+
+        public TableParsingException(String errorMessage) {
+            super(errorMessage);
+        }
+    }
+
+    /** Raised when the table contains nested tables and those are being skipped. */
+    public static class TableParsingSubTablesException extends TableParsingException {
+        private static final long serialVersionUID = -4415254026083906516L;
+
+        public TableParsingSubTablesException(String errorMessage) {
+            super(errorMessage);
+        }
+    }
+}
diff --git a/runtime_testing/__init__.py b/runtime_testing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/runtime_testing/resources/__init__.py b/runtime_testing/resources/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar b/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar
deleted file mode 100644
index d7b78f6..0000000
Binary files a/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar and /dev/null differ
diff --git a/runtime_testing/resources/table.html b/runtime_testing/resources/table.html
new file mode 100644
index 0000000..7be0e6c
--- /dev/null
+++ b/runtime_testing/resources/table.html
@@ -0,0 +1,67 @@
+
+
+
+ | Catchers |
+ Infielders |
+ Outfielders |
+ Rotation |
+ Bullpen |
+
+
+ | Gary Sanchez |
+ 1B Greg Bird |
+ LF Giancarlo Stanton |
+ Masahiro Tanaka |
+ CL Aroldis Chapman |
+
+
+ | Austin Romine |
+ 1B Luke Voit |
+ CF Brett Gardner |
+ James Paxton |
+ SU Zack Britton |
+
+
+ |
+ 2B Gleyber Torres |
+ RF Aaron Judge |
+ J.A. Happ |
+ SU Chad Green |
+
+
+ |
+ SS Troy Tulowitzki |
+ UTIL Tyler Wade |
+ Luis Cessa |
+ SU Adam Ottavino |
+
+
+ |
+ 3B Miguel Andujar |
+ |
+ |
+ MR Jonathan Holder |
+
+
+ |
+ IF DJ LeMahieu |
+ |
+ |
+ MR Tommy Kahnle |
+
+
+ |
+ |
+ |
+ |
+ MR Stephen Tarpley |
+
+
+ |
+ |
+ |
+ |
+ SWG Domingo German |
+
+
+
diff --git a/runtime_testing/utils/table_classifier.py b/runtime_testing/utils/table_classifier.py
new file mode 100644
index 0000000..e0dbe0f
--- /dev/null
+++ b/runtime_testing/utils/table_classifier.py
@@ -0,0 +1,30 @@
+"""Thin pyjnius wrapper around the Java ``TableClassifier`` from the dwtc-extension JAR."""
+import os
+
+import jnius_config
+
+_RESOURCES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'resources')
+jar_file_path = os.path.join(_RESOURCES_DIR, 'dwtc-extension-1.0-jar-with-dependencies.jar')
+# The classpath has to be set before ``jnius`` is imported below.
+jnius_config.set_classpath(jar_file_path)
+
+from jnius import autoclass  # noqa: E402
+
+TableClassifier = autoclass('webreduce.extension.classification.TableClassifier')
+table_classifier = TableClassifier()
+
+
+def classify_table_2_phase(table_html):
+    """Classify ``table_html`` with the Java ``TableClassifier``.
+
+    Returns the Java ``ClassificationResult``, or ``None`` when the Java side returns null
+    (it does so when the HTML cannot be parsed into a table).
+    """
+    return table_classifier.classifyTable(table_html)
+
+
+if __name__ == '__main__':
+    with open(os.path.join(_RESOURCES_DIR, 'table.html')) as f:
+        table_html = f.read()
+    result = classify_table_2_phase(table_html)
+    print('Table could not be parsed' if result is None else result.tableType.toString())
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5d5bedd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,19 @@
+from setuptools import setup, find_packages
+
+# package_data patterns are resolved relative to each package's own directory, so the
+# JAR is listed by bare file name under the package that contains it.
+JAR_NAME = 'dwtc-extension-1.0-jar-with-dependencies.jar'
+
+setup(
+    name='web-table-classification',
+    version='1.0.0',
+    install_requires=['pyjnius'],
+    packages=find_packages(),
+    url='',
+    license='',
+    author='',
+    author_email='',
+    description='',
+    package_data={'runtime_testing.resources': [JAR_NAME]},
+    include_package_data=True,
+)
diff --git a/web_table_classification/__init__.py b/web_table_classification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web_table_classification/dwtc_table_classifier.py b/web_table_classification/dwtc_table_classifier.py
new file mode 100644
index 0000000..fbe88f8
--- /dev/null
+++ b/web_table_classification/dwtc_table_classifier.py
@@ -0,0 +1,80 @@
+import os
+import logging
+import urllib.request
+
+
+JAR_URL = (
+    "https://github.com/lavuy/web-table-classification/raw/c6cc1eeb62b996b8bbcd26b6dc27841d1464b884/"
+    "runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar"
+)
+jar_file_path = f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar'
+if not os.path.isfile(jar_file_path):
+    logging.warning('JAR File missing, downloading')
+    # urlretrieve does not create parent directories.
+    os.makedirs(os.path.dirname(jar_file_path), exist_ok=True)
+    # Download to a temporary name first so an interrupted transfer never leaves a
+    # truncated file that would pass the isfile() check on the next import.
+    partial_path = jar_file_path + '.part'
+    urllib.request.urlretrieve(JAR_URL, partial_path)
+    os.replace(partial_path, jar_file_path)
+
+os.environ['CLASSPATH'] = jar_file_path
+
+# Created lazily by _init_table_classifier().
+table_classifier = None
+
+
+def _init_table_classifier():
+    """Import jnius and instantiate the Java ``TableClassifier`` on first use."""
+    global table_classifier
+    # jnius is imported here rather than at module level: when running in multiprocess
+    # the JDK seems to use the parent's JDK, thus causing a seg fault.
+    from jnius import autoclass
+
+    TableClassifier = autoclass('webreduce.extension.classification.TableClassifier')
+    table_classifier = TableClassifier()
+
+
+def classify_table_2_phase(table_html):
+    """Classify ``table_html``; returns the Java result object, or ``None`` when the
+    Java side returns null (it does so for HTML it cannot parse into a table)."""
+    if table_classifier is None:
+        _init_table_classifier()
+    return table_classifier.classifyTable(table_html)
+
+
+def classify_and_print(table_html):
+    """Classify ``table_html`` once, print ``<pid>: <type>`` and return the type string."""
+    table_type = classify_table_2_phase(table_html).tableType.toString()
+    print(f'{os.getpid()}: {table_type}')
+    return table_type
+
+
+if __name__ == '__main__':
+    with open(f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/table.html') as f:
+        table_html = f.read()
+    result = classify_table_2_phase(table_html)
+    print('Table could not be parsed' if result is None else result.tableType.toString())
+
+# import multiprocessing
+# from concurrent.futures import ProcessPoolExecutor
+# from datetime import datetime
+# if __name__ == '__main__':
+# multiprocessing.set_start_method("spawn", force=True)
+# start = datetime.now()
+# futures = []
+# with ProcessPoolExecutor(max_workers=1
+# ) as pool:
+# with open(f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/table.html') as f:
+# table_html = f.read()
+# for _ in range(1000):
+# futures.append(pool.submit(classify_and_print, table_html))
+# # futures.append(pool.submit(classify_table_2_phase, table_html))
+# # print(classify_table_2_phase(table_html).tableType.toString())
+# for future in futures:
+# res = future.result()
+# print(res)
+# assert res
+# # finished += 1
+# # R logging.info(f"Finished {finished}/{len(futures)} overall")
+# print(f"Took: {datetime.now() - start}")