diff --git a/Manifest.in b/Manifest.in new file mode 100644 index 0000000..4a5a926 --- /dev/null +++ b/Manifest.in @@ -0,0 +1 @@ +include runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dwtc-extension/.classpath b/dwtc-extension/.classpath index 9ba41a2..b3a4342 100644 --- a/dwtc-extension/.classpath +++ b/dwtc-extension/.classpath @@ -31,6 +31,19 @@ + + + + + + + + + + + + + diff --git a/dwtc-extension/.project b/dwtc-extension/.project index 3238494..0b5d123 100644 --- a/dwtc-extension/.project +++ b/dwtc-extension/.project @@ -22,12 +22,12 @@ - 1605118619462 + 1705582647115 30 org.eclipse.core.resources.regexFilterMatcher - node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ diff --git a/dwtc-extension/pom.xml b/dwtc-extension/pom.xml index ae5cb37..8e169f0 100644 --- a/dwtc-extension/pom.xml +++ b/dwtc-extension/pom.xml @@ -13,7 +13,7 @@ UTF-8 11 - 11 + 1.7 diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java index 2f86741..93dede7 100644 --- a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java +++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassificationPhase1.java @@ -1,37 +1,31 @@ package webreduce.extension.classification; - +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; import java.util.Arrays; - -import com.google.common.base.Optional; import com.google.inject.name.Named; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - import webreduce.data.TableType; import webreduce.extraction.mh.TableClassification; import webreduce.extraction.mh.features.FeaturesP1; import webreduce.extraction.mh.tools.ClassificationResult; -import webreduce.extraction.mh.tools.TableConvert; import weka.classifiers.Classifier; import weka.core.Attribute; import weka.core.Instance; +import webreduce.extension.classification.TableParser.TableParsingException; +import webreduce.extension.classification.TableParser.TableParsingSubTablesException; public class TableClassificationPhase1 { - private TableConvert tableConverter; private FeaturesP1 phase1Features; private Classifier classifier1; private Attribute classAttr1; private double layoutVal, relationVal; - private static final int TABLE_MIN_ROWS = 2; - private static final int TABLE_MIN_COLS = 2; + private TableParser tableParser; public TableClassificationPhase1(@Named("phase1ModelPath") String phase1ModelPath) { phase1Features = new FeaturesP1(); - tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS); + tableParser = new TableParser(); try { classifier1 = TableClassification.loadModelFromFile(phase1ModelPath); @@ -68,10 +62,10 @@ public ClassificationResult classifyTable(Element[][] convertedTable) { public ClassificationResult classifyTable(String tableHTML) { Element[][] table; try { - table = parseTableHTML(tableHTML); + table = tableParser.parseTableHTML(tableHTML); } catch (TableParsingSubTablesException e) { System.out.println(e.getMessage()); - return new ClassificationResult(TableType.LAYOUT, new double[]{}, null); + return new ClassificationResult(TableType.LAYOUT, new double[] {}, null); } catch (TableParsingException e) { System.out.println(e.getMessage()); return null; @@ -83,9 +77,9 @@ public double[] computeFeatures(String tableHTML) { Element[][] table; try { - table = parseTableHTML(tableHTML); + table = tableParser.parseTableHTML(tableHTML); } catch (TableParsingException e) { - System.out.println(e.getMessage()); + System.out.println(e.getMessage()); return null; } @@ -94,45 +88,27 @@ public double[] computeFeatures(String tableHTML) { return currentInst.toDoubleArray(); } - private Element[][] parseTableHTML(String tableHTML) throws TableParsingException { - return parseTableHTML(tableHTML, true); - } - - private Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException { - Document doc = Jsoup.parse(tableHTML); - Element table = doc.select("table").first(); - if (table == null) { - throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification."); - } - - Elements subTables = table.getElementsByTag("table"); - subTables.remove(table); - if (subTables.size() > 0 && skipSubTables) { - throw new TableParsingSubTablesException( - "Failure, table includes sub-table(s). Skipping table classification."); - } - - Optional convertedTable = tableConverter.toTable(table); - if (!convertedTable.isPresent()) { - throw new TableParsingException("toTable() failed. Skipping table classification."); - } - - return convertedTable.get(); - } - - public class TableParsingException extends Exception { - private static final long serialVersionUID = 5471172109211007529L; - - public TableParsingException(String errorMessage) { - super(errorMessage); - } - } - - public class TableParsingSubTablesException extends TableParsingException { - private static final long serialVersionUID = -4415254026083906516L; - - public TableParsingSubTablesException(String errorMessage) { - super(errorMessage); - } + public static void main(String[] args) throws IOException { + BufferedReader br = new BufferedReader( + new FileReader("/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html")); + try { + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + sb.append(line); + sb.append(System.lineSeparator()); + line = br.readLine(); + } + String everything = sb.toString(); + TableClassificationPhase1 classificationPhase1 = new TableClassificationPhase1("/Users/yuvalpeleg/projects/web-table-classification/runtime_testing/resources/RandomForest_P1.mdl"); + var res = classificationPhase1.classifyTable(everything); + System.out.println(res.toString()); + + } catch (Exception e) { + System.out.println(e.toString()); + } finally { + br.close(); } + } } diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java new file mode 100644 index 0000000..e62d36f --- /dev/null +++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableClassifier.java @@ -0,0 +1,63 @@ +package webreduce.extension.classification; + +import org.jsoup.nodes.Element; + +import webreduce.data.TableType; +import webreduce.extraction.mh.tools.ClassificationResult; +import webreduce.extraction.mh.TableClassification; +import webreduce.extension.classification.TableParser.TableParsingException; +import webreduce.extension.classification.TableParser.TableParsingSubTablesException; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +public class TableClassifier { + private TableParser tableParser; + private TableClassification tableClassification; + + public TableClassifier() { + tableParser = new TableParser(); + tableClassification = new TableClassification( + "/RandomForest_P1.mdl", + "/RandomForest_P2.mdl"); + } + + public ClassificationResult classifyTable(String tableHTML) { + Element[][] table; + try { + table = tableParser.parseTableHTML(tableHTML); + } catch (TableParsingSubTablesException e) { + System.out.println(e.getMessage()); + return new ClassificationResult(TableType.LAYOUT, new double[] {}, null); + } catch (TableParsingException e) { + System.out.println(e.getMessage()); + return null; + } + return tableClassification.classifyTable(table); + } + + public static void main(String[] args) throws IOException { + BufferedReader br = new BufferedReader( + new FileReader("/Users/yuvalpeleg/My Drive/Projects/JParser/tables/table.html")); + try { + TableClassification.loadModelFromClasspath("/RandomForest_P1.mdl"); + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + sb.append(line); + sb.append(System.lineSeparator()); + line = br.readLine(); + } + String everything = sb.toString(); + TableClassifier table_classifier = new TableClassifier(); + var res = table_classifier.classifyTable(everything); + System.out.println(res.toString()); + + } catch (Exception e) { + System.out.println(e.toString()); + } finally { + br.close(); + } + } +} diff --git a/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java b/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java new file mode 100644 index 0000000..05b4284 --- /dev/null +++ b/dwtc-extension/src/main/java/webreduce/extension/classification/TableParser.java @@ -0,0 +1,60 @@ +package webreduce.extension.classification; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.google.common.base.Optional; + +import webreduce.extraction.mh.tools.TableConvert; + +public class TableParser { + private TableConvert tableConverter; + private static final int TABLE_MIN_ROWS = 2; + private static final int TABLE_MIN_COLS = 2; + public TableParser() { + tableConverter = new TableConvert(TABLE_MIN_ROWS, TABLE_MIN_COLS); + } + + public Element[][] parseTableHTML(String tableHTML) throws TableParsingException { + return parseTableHTML(tableHTML, true); + } + + public Element[][] parseTableHTML(String tableHTML, boolean skipSubTables) throws TableParsingException { + Document doc = Jsoup.parse(tableHTML); + Element table = doc.select("table").first(); + if (table == null) { + throw new TableParsingException("Failure, no table was detected in HTML. Skipping table classification."); + } + + Elements subTables = table.getElementsByTag("table"); + subTables.remove(table); + if (subTables.size() > 0 && skipSubTables) { + throw new TableParsingSubTablesException( + "Failure, table includes sub-table(s). Skipping table classification."); + } + + Optional convertedTable = tableConverter.toTable(table); + if (!convertedTable.isPresent()) { + throw new TableParsingException("toTable() failed. Skipping table classification."); + } + + return convertedTable.get(); + } + + public class TableParsingException extends Exception { + private static final long serialVersionUID = 5471172109211007529L; + + public TableParsingException(String errorMessage) { + super(errorMessage); + } + } + + public class TableParsingSubTablesException extends TableParsingException { + private static final long serialVersionUID = -4415254026083906516L; + + public TableParsingSubTablesException(String errorMessage) { + super(errorMessage); + } + } +} diff --git a/runtime_testing/__init__.py b/runtime_testing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/runtime_testing/resources/__init__.py b/runtime_testing/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar b/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar deleted file mode 100644 index d7b78f6..0000000 Binary files a/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar and /dev/null differ diff --git a/runtime_testing/resources/table.html b/runtime_testing/resources/table.html new file mode 100644 index 0000000..7be0e6c --- /dev/null +++ b/runtime_testing/resources/table.html @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CatchersInfieldersOutfieldersRotationBullpen
Gary Sanchez1B Greg BirdLF Giancarlo StantonMasahiro TanakaCL Aroldis Chapman
Austin Romine1B Luke VoitCF Brett GardnerJames PaxtonSU Zack Britton
2B Gleyber TorresRF Aaron JudgeJ.A. HappSU Chad Green
SS Troy TulowitzkiUTIL Tyler WadeLuis CessaSU Adam Ottavino
3B Miguel AndujarMR Jonathan Holder
IF DJ LeMahieuMR Tommy Kahnle
MR Stephen Tarpley
SWG Domingo German
diff --git a/runtime_testing/utils/table_classifier.py b/runtime_testing/utils/table_classifier.py new file mode 100644 index 0000000..e0dbe0f --- /dev/null +++ b/runtime_testing/utils/table_classifier.py @@ -0,0 +1,17 @@ +import jnius_config +import os +jar_file_path = f'{os.path.dirname(os.path.realpath(__file__))}/../resources/dwtc-extension-1.0-jar-with-dependencies.jar' +jnius_config.set_classpath(jar_file_path) + +from jnius import autoclass +TableClassifier = autoclass('webreduce.extension.classification.TableClassifier') +table_classifier = TableClassifier() + + +def classify_table_2_phase(table_html): + return table_classifier.classifyTable(table_html) + +if __name__ == '__main__': + with open(f'{os.path.dirname(os.path.realpath(__file__))}/../resources/table.html') as f: + table_html = f.read() + print(classify_table_2_phase(table_html).tableType.toString()) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5d5bedd --- /dev/null +++ b/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup, find_packages + +setup( + name='web-table-classification', + version='1.0.0', + install_requires=['pyjnius'], + packages=find_packages(), + url='', + license='', + author='', + author_email='', + description='', +# package_data={'web_table_classification': ['runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar']}, + package_data={"": ["runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar"]}, + + include_package_data = True + #data_files=[('resources', ['runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar'])] +) diff --git a/web_table_classification/__init__.py b/web_table_classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web_table_classification/dwtc_table_classifier.py b/web_table_classification/dwtc_table_classifier.py new file mode 100644 index 0000000..fbe88f8 --- /dev/null +++ b/web_table_classification/dwtc_table_classifier.py @@ -0,0 +1,65 @@ +import os +import logging +import urllib.request + + +jar_file_path = f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar' +if not os.path.isfile(jar_file_path): + logging.warning('JAR File missing, downloading') + urllib.request.urlretrieve( + "https://github.com/lavuy/web-table-classification/raw/c6cc1eeb62b996b8bbcd26b6dc27841d1464b884/runtime_testing/resources/dwtc-extension-1.0-jar-with-dependencies.jar", + jar_file_path) + +os.environ['CLASSPATH'] = jar_file_path + +global table_classifier +table_classifier = None + + +def _init_table_classifier(): + # When running in multiprocess the JDK seems to use the parent's JDK thus causing a seg fault + from jnius import autoclass + + TableClassifier = autoclass('webreduce.extension.classification.TableClassifier') + global table_classifier + table_classifier = TableClassifier() + + +def classify_table_2_phase(table_html): + if table_classifier is None: + _init_table_classifier() + return table_classifier.classifyTable(table_html) + + +def classify_and_print(table_html): + print(f'{os.getpid()}: {classify_table_2_phase(table_html).tableType.toString()}') + return classify_table_2_phase(table_html).tableType.toString() + + +if __name__ == '__main__': + with open(f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/table.html') as f: + table_html = f.read() + print(classify_table_2_phase(table_html).tableType.toString()) + +# import multiprocessing +# from concurrent.futures import ProcessPoolExecutor +# from datetime import datetime +# if __name__ == '__main__': +# multiprocessing.set_start_method("spawn", force=True) +# start = datetime.now() +# futures = [] +# with ProcessPoolExecutor(max_workers=1 +# ) as pool: +# with open(f'{os.path.dirname(os.path.realpath(__file__))}/../runtime_testing/resources/table.html') as f: +# table_html = f.read() +# for _ in range(1000): +# futures.append(pool.submit(classify_and_print, table_html)) +# # futures.append(pool.submit(classify_table_2_phase, table_html)) +# # print(classify_table_2_phase(table_html).tableType.toString()) +# for future in futures: +# res = future.result() +# print(res) +# assert res +# # finished += 1 +# # R logging.info(f"Finished {finished}/{len(futures)} overall") +# print(f"Took: {datetime.now() - start}")