From eae548afa016b9da59d224bee798db4569714081 Mon Sep 17 00:00:00 2001 From: sillygoose Date: Tue, 2 Dec 2025 13:34:59 -0800 Subject: [PATCH 1/2] Add structured output export to CSV and JSON --- data/labels.json | 687 +-------------------------- scripts/full_pipeline.py | 191 ++++++-- src/model/models/pdf_classifier.json | 1 + src/model/pdf_classifier.py | 119 ++++- src/output/__init__ | 17 + src/output/structured_output.py | 317 ++++++++++++ tests/test_structured_output.py | 319 +++++++++++++ 7 files changed, 928 insertions(+), 723 deletions(-) create mode 100644 src/model/models/pdf_classifier.json create mode 100644 src/output/__init__ create mode 100644 src/output/structured_output.py create mode 100644 tests/test_structured_output.py diff --git a/data/labels.json b/data/labels.json index 8c076c1..c25866b 100644 --- a/data/labels.json +++ b/data/labels.json @@ -1,669 +1,22 @@ { - "Abe_1989.txt": "useful", - "Adams_1989.txt": "useful", - "Agapov_1938.txt": "useful", - "Agudelo-Cantero_2015.txt": "useful", - "Aiken_1998.txt": "useful", - "Alcaraz_2015.txt": "useful", - "Alexander_1986.txt": "useful", - "Ali_2016.txt": "useful", - "Allison_2013.txt": "useful", - "Alterio_1997.txt": "useful", - "Altin_2015.txt": "useful", - "Andersone_2004.txt": "useful", - "Anderson_1999.txt": "useful", - "Andreone_1999.txt": "useful", - "Angelici_2005.txt": "useful", - "Angelici_2014.txt": "useful", - "Annett_1984.txt": "useful", - "Ansell_1999.txt": "useful", - "Antonelis_1994.txt": "useful", - "Archer_2004.txt": "useful", - "Arendt_2001.txt": "useful", - "Arrington_2002.txt": "useful", - "Aykanat_2020.txt": "useful", - "Baeta_2013.txt": "useful", - "Baghli_2002.txt": "useful", - "Bajkov_1934.txt": "useful", - "Bakaloudis_2012.txt": "useful", - "Bakus_1959.txt": "useful", - "Ballesteros_2009.txt": "useful", - "Bandeira_2017.txt": "useful", - "Baranovskaya_1935.txt": "useful", - "Barbini_2016.txt": "useful", - "Barbosa_2015.txt": "useful", - "Barrett_2012.txt": "useful", - "Baruah_2001.txt": "useful", - "Battaglia_2014.txt": "useful", - "Battle_1998.txt": "useful", - "Beard_2007.txt": "useful", - "Beauchamp_1990.txt": "useful", - "Beeton_1956.txt": "useful", - "Behn_2019.txt": "useful", - "Bengtson_1968.txt": "useful", - "Bengtsson_2023.txt": "useful", - "Berazategui_2007.txt": "useful", - "Berchtold_2015.txt": "useful", - "Berg_2002.txt": "useful", - "Berry_1981.txt": "useful", - "Best_1984.txt": "useful", - "Best_1986.txt": "useful", - "Best_1987.txt": "useful", - "Birkeland_1974.txt": "useful", - "Birkeland_1982.txt": "useful", - "Bisa_2007.txt": "useful", - "Blackburn_2006.txt": "useful", - "Blanco-Parra_2012.txt": "useful", - "Blanc_1998.txt": "useful", - "Boelter_2007.txt": "useful", - "Boelter_2012.txt": "useful", - "Bojsen_2005.txt": "useful", - "Bolek_1997.txt": "useful", - "Bonham_1941.txt": "useful", - "Borcherding_2019.txt": "useful", - "Bothma_1966a.txt": "useful", - "Bothma_1966b.txt": "useful", - "Bothma_1971.txt": "useful", - "Bourque_2018.txt": "useful", - "Bradstreet_1982.txt": "useful", - "Brandao_2003.txt": "useful", - "Braune_1987.txt": "useful", - "Brewer_1989.txt": "useful", - "Brito_2013.txt": "useful", - "Brito_2015.txt": "useful", - "Brun_1972.txt": "useful", - "Buckley_1997.txt": "useful", - "Burbank_2022.txt": "useful", - "Burghart_2010.txt": "useful", - "Bury_1973.txt": "useful", - "Bustnes_2001.txt": "useful", - "Bwong_2010.txt": "useful", - "Cabeceira_2015.txt": "useful", - "Cada_1987.txt": "useful", - "Caldart_2012.txt": "useful", - "Camilo_1993.txt": "useful", - 
"Camino_2023.txt": "useful", - "Carey_1972.txt": "useful", - "Carmona-Antonanzas_2016.txt": "useful", - "Caron_2004.txt": "useful", - "Carpenter_1952.txt": "useful", - "Carss_1993.txt": "useful", - "Cartes_1993.txt": "useful", - "Casali_2023.txt": "useful", - "Casta\u00f1eda_2006.txt": "useful", - "Castilla_1991.txt": "useful", - "Castriota_2007.txt": "useful", - "Castro_1989.txt": "useful", - "Castro_1990.txt": "useful", - "Castro_2016.txt": "useful", - "Catling_1988.txt": "useful", - "Caut_2013.txt": "useful", - "Cecala_2007.txt": "useful", - "Cerda_1993.txt": "useful", - "Chambellant_2013.txt": "useful", - "Chelotti_2022.txt": "useful", - "Cherel_2003.txt": "useful", - "Cherel_2004.txt": "useful", - "Chintiroglou_1992.txt": "useful", - "Chou_1995.txt": "useful", - "Christiansen_2012.txt": "useful", - "Clady_1974.txt": "useful", - "Clarke_1996.txt": "useful", - "Coco_2014.txt": "useful", - "Coelho_1997.txt": "useful", - "Collard_1970.txt": "useful", - "Condit_1984.txt": "useful", - "Connell_1961.txt": "useful", - "Connell_1970.txt": "useful", - "Copson_1986.txt": "useful", - "Cordone_2022.txt": "useful", - "Costa_2014.txt": "useful", - "Crabtree_1998.txt": "useful", - "Crawford_2009.txt": "useful", - "Cressey_1970.txt": "useful", - "Crnobrnja-Isailovi\u0107_2012.txt": "useful", - "Croxall_1988.txt": "useful", - "Custer_1996.txt": "useful", - "Cutter_1958.txt": "useful", - "Dale_2011.txt": "useful", - "Dayton_1977.txt": "useful", - "deAguiar_2004.txt": "useful", - "Dearborn_1977.txt": "useful", - "Dearborn_1986.txt": "useful", - "Dearborn_1991.txt": "useful", - "Dearborn_1996.txt": "useful", - "Dehn_2006.txt": "useful", - "deJuan_2007.txt": "useful", - "Delany_1990.txt": "useful", - "deOliveiraNeves_2014.txt": "useful", - "DeWitt_1972.txt": "useful", - "Djait_2019.txt": "useful", - "Dominguez_2000.txt": "useful", - "Downie_2010.txt": "useful", - "Drygala_2013.txt": "useful", - "Duggins_1983.txt": "useful", - "Dur\u00e9_2001.txt": "useful", - "Eichelbaum_1909.txt": "useful", - "Elrod_1981.txt": "useful", - "Eriksen_2021.txt": "useful", - "Espinoza_2015.txt": "useful", - "Evans_1996.txt": "useful", - "Fagade_1973.txt": "useful", - "Fairweather_1985.txt": "useful", - "Falke_2024.txt": "useful", - "Falk_1992.txt": "useful", - "Falk_1993.txt": "useful", - "Fanelli_2004.txt": "useful", - "Fay_1987.txt": "useful", - "Feder_1959.txt": "useful", - "Feigenbaum_1979.txt": "useful", - "Felix_2006.txt": "useful", - "Fenchel_1965.txt": "useful", - "Fernandes_2024.txt": "useful", - "Ferrari_1989.txt": "useful", - "Ferreira_2012.txt": "useful", - "Ferreira_2015.txt": "useful", - "Fevolden_1982.txt": "useful", - "Finley_1983.txt": "useful", - "Finley_1990.txt": "useful", - "Fisher_2008.txt": "useful", - "Fisk_2002.txt": "useful", - "Fitz_1991.txt": "useful", - "Fleharty_1967.txt": "useful", - "Flores_2008.txt": "useful", - "Force_1935.txt": "useful", - "Ford_1998.txt": "useful", - "Frantz_1970.txt": "useful", - "Fraser_1970.txt": "useful", - "Fratt_1984.txt": "useful", - "Freire_1995.txt": "useful", - "Freire_1996.txt": "useful", - "Fritz_1974.txt": "useful", - "Froneman_1998.txt": "useful", - "Fujinami_2018.txt": "useful", - "Fu_2013.txt": "useful", - "Gaborieau_2004.txt": "useful", - "Gales_1988.txt": "useful", - "Gales_1989.txt": "useful", - "Gales_1990.txt": "useful", - "Gales_1992.txt": "useful", - "Gale_2011.txt": "useful", - "Ganmanee_2003.txt": "useful", - "Garcia_1988.txt": "useful", - "Garda_2007.txt": "useful", - "Gaymer_2001.txt": "useful", - "Gaymer_2008.txt": "useful", - 
"Gelsleichter_1998.txt": "useful", - "Gibbs_2011.txt": "useful", - "Gil_2007.txt": "useful", - "Gil_2008.txt": "useful", - "Giuntoli_1978.txt": "useful", - "Glaudas_2008.txt": "useful", - "Glorioso_2010.txt": "useful", - "Godfrey_1980.txt": "useful", - "Gong_2023.txt": "useful", - "Gonzalez_1994.txt": "useful", - "Gorbatenko_2009.txt": "useful", - "Graham_2001.txt": "useful", - "Grainger_2020.txt": "useful", - "Gray_1994.txt": "useful", - "Greenstone_1983.txt": "useful", - "Gregory_1978.txt": "useful", - "Gregory_1991.txt": "useful", - "Gregory_2013.txt": "useful", - "Guillemette_1992.txt": "useful", - "Gunther_2013.txt": "useful", - "Gunzburger_1999.txt": "useful", - "Guseinov_2004.txt": "useful", - "Hales_2008.txt": "useful", - "Hamilton_1951.txt": "useful", - "Hamilton_1956.txt": "useful", - "Hantak_2016.txt": "useful", - "Hardy_2006.txt": "useful", - "Harris_2009.txt": "useful", - "Hayes_1985.txt": "useful", - "Heithaus_2001.txt": "useful", - "Henderson_1982.txt": "useful", - "Heng_2018.txt": "useful", - "Henschel_1994.txt": "useful", - "Hesse_2025.txt": "useful", - "Himmelman_1991.txt": "useful", - "Hindell_1988a.txt": "useful", - "Hindell_1988b.txt": "useful", - "Hirai_1999.txt": "useful", - "Hirai_2000.txt": "useful", - "Hirai_2002.txt": "useful", - "Hirai_2004.txt": "useful", - "Hirschfeld_2011.txt": "useful", - "Hockman_1983.txt": "useful", - "Holohan_1998.txt": "useful", - "Hop_1992.txt": "useful", - "Houston_1993.txt": "useful", - "Howell_2003.txt": "useful", - "Hromada_2003.txt": "useful", - "Hsueh_1992.txt": "useful", - "Huckembeck_2014.txt": "useful", - "Huey_2001.txt": "useful", - "Humphries_1992.txt": "useful", - "Huseynov_2005.txt": "useful", - "Hutton_1987.txt": "useful", - "Insley_2021.txt": "useful", - "Ito_2009.txt": "useful", - "Iverson_2024.txt": "useful", - "Jacquemin_2014.txt": "useful", - "Jarv_2011.txt": "useful", - "Jenson_1960.txt": "useful", - "Jeong-Chae_2020.txt": "useful", - "Jewett_1982.txt": "useful", - "Jewett_1983.txt": "useful", - "Jillson_1981.txt": "useful", - "Johnson_2015.txt": "useful", - "Joyce_2002.txt": "useful", - "Jude_1973.txt": "useful", - "Juinio_1992.txt": "useful", - "Jurajda_2016.txt": "useful", - "Kadri_2012.txt": "useful", - "Kadye_2012.txt": "useful", - "Kalogianni_2010.txt": "useful", - "Kam_1998.txt": "useful", - "Kangur_1998.txt": "useful", - "Kehayias_1996.txt": "useful", - "Keough_1979.txt": "useful", - "Kephart_1982.txt": "useful", - "Keppeler_2013.txt": "useful", - "Kerle_2000.txt": "useful", - "Keskinen_2004.txt": "useful", - "Khan_2013.txt": "useful", - "Khan_2014.txt": "useful", - "Kidawa_2011.txt": "useful", - "Kidera_2008.txt": "useful", - "Killengreen_2011.txt": "useful", - "Kimmerer_1984.txt": "useful", - "King_1982.txt": "useful", - "King_2005.txt": "useful", - "Klages_1990.txt": "useful", - "Klimstra_1959a.txt": "useful", - "Klimstra_1959b.txt": "useful", - "Kloock_2001.txt": "useful", - "Knickle_2013.txt": "useful", - "Kock_1994.txt": "useful", - "Kofron_1978.txt": "useful", - "Kolb_1979.txt": "useful", - "Kopecky_2011.txt": "useful", - "Kopecky_2012.txt": "useful", - "Kopecky_2015.txt": "useful", - "Kreiling_2021.txt": "useful", - "Krupa_2002.txt": "useful", - "Langford_1941.txt": "useful", - "Lanszki_2007.txt": "useful", - "Lanszki_2015.txt": "useful", - "Laptikhovsky_2014.txt": "useful", - "Laufer_2021.txt": "useful", - "Laur\u00eda-Manzano_2014.txt": "useful", - "LeBrasseur_1966.txt": "useful", - "Lefebvre_1990.txt": "useful", - "Leivas_2012.txt": "useful", - "Lemos_2015.txt": "useful", - "Leonard_1942.txt": 
"useful", - "Leon_2004.txt": "useful", - "Lever_1959.txt": "useful", - "Lewis_2014.txt": "useful", - "Lima_2010.txt": "useful", - "Lima_2022.txt": "useful", - "Lin_2008.txt": "useful", - "Lisboa_2012.txt": "useful", - "Liu_2015.txt": "useful", - "Li_2015.txt": "useful", - "Loh_2011.txt": "useful", - "Longhurst_1957.txt": "useful", - "Lonne_1992.txt": "useful", - "Louda_1979.txt": "useful", - "Lucifora_2005.txt": "useful", - "Lugton_1993.txt": "useful", - "Luiselli_2004.txt": "useful", - "Luiselli_2006.txt": "useful", - "Luna_2022.txt": "useful", - "Lydersen_1991.txt": "useful", - "Lynch_1985.txt": "useful", - "L\u00f3pez_2010.txt": "useful", - "Macale_2008.txt": "useful", - "Macartney_1989.txt": "useful", - "Maciel_2012.txt": "useful", - "Macy_1982.txt": "useful", - "Maeda_2004.txt": "useful", - "Magnusdottir_2012.txt": "useful", - "Mahan_2007.txt": "useful", - "Maia-Carneiro_2012.txt": "useful", - "Maia_2006.txt": "useful", - "Main_2011.txt": "useful", - "Majaneva_2013.txt": "useful", - "Marais_1984.txt": "useful", - "Marques_2015.txt": "useful", - "MarquezVelasquez_2019.txt": "useful", - "Martin_1996.txt": "useful", - "Maser_1983.txt": "useful", - "Mauzey_1966.txt": "useful", - "Mauzey_1968.txt": "useful", - "Mazzotti_2020.txt": "useful", - "McClintock_1985.txt": "useful", - "McCluskey_2021.txt": "useful", - "McDermott_1965.txt": "useful", - "McDermott_1987.txt": "useful", - "McDonald_2000.txt": "useful", - "McElroy_2006.txt": "useful", - "McWilliams_1989.txt": "useful", - "Measey_2004.txt": "useful", - "Meckstroth_2007.txt": "useful", - "Megina_2001.txt": "useful", - "Meheust_2015.txt": "useful", - "Meinzer_1975.txt": "useful", - "Meise_2003.txt": "useful", - "Mendonca_2009.txt": "useful", - "Menge_1972.txt": "useful", - "Menge_1974a.txt": "useful", - "Menge_1974b.txt": "useful", - "Mercier_2011.txt": "useful", - "Mesa_2015.txt": "useful", - "Metillo_2011.txt": "useful", - "Mikkelsen_2002.txt": "useful", - "Milanovich_2008.txt": "useful", - "Miller_1990.txt": "useful", - "Miller_1995.txt": "useful", - "Minello_1989.txt": "useful", - "Mitchell_1953.txt": "useful", - "Moku_2000.txt": "useful", - "Moncada-Rosas_2025.txt": "useful", - "Montague_1988.txt": "useful", - "Montano_2017.txt": "useful", - "Moreira_2014.txt": "useful", - "Moreno-Leon_2009.txt": "useful", - "Muenz_2008.txt": "useful", - "Murie_1992.txt": "useful", - "Murphy_1992.txt": "useful", - "Murphy_1995.txt": "useful", - "Murphy_2004.txt": "useful", - "Murphy_2008.txt": "useful", - "Mustamaki_2014.txt": "useful", - "Mutlu_1999.txt": "useful", - "Myhre_1975.txt": "useful", - "Nack_2015.txt": "useful", - "Nagorsen_1989.txt": "useful", - "Nakagawa_2023.txt": "useful", - "Nakano_2016.txt": "useful", - "Navarrete_2008.txt": "useful", - "Nentwig_1985.txt": "useful", - "Neves_2009.txt": "useful", - "Newman_2010.txt": "useful", - "Ngo_2014.txt": "useful", - "Ngo_2014a.txt": "useful", - "Ngo_2014b.txt": "useful", - "Nickels_2023.txt": "useful", - "Nielsen_2019.txt": "useful", - "Niethammer_1992.txt": "useful", - "Nilssen_1995.txt": "useful", - "Noda_1992.txt": "useful", - "Nogueira-Junior_2008.txt": "useful", - "Norman_1992.txt": "useful", - "Novak_2010.txt": "useful", - "Novak_2013.txt": "useful", - "Novak_2017.txt": "useful", - "Oh_2001.txt": "useful", - "Ojeda_1991.txt": "useful", - "Olson_2012.txt": "useful", - "Oosten_1938.txt": "useful", - "Opacak_2004.txt": "useful", - "Ordiano-Flores_2024.txt": "useful", - "Ordzie_1980.txt": "useful", - "Orejas_2001.txt": "useful", - "Orejas_2013.txt": "useful", - "Oresland_2000.txt": "useful", 
- "Oskarsson_2016.txt": "useful", - "Ottenbacher_1994.txt": "useful", - "Ovaska_1991.txt": "useful", - "Ozaki_2023.txt": "useful", - "Pages_1997.txt": "useful", - "Paine_1969.txt": "useful", - "Pakhomov_1998.txt": "useful", - "Palmer_1988.txt": "useful", - "Paltridge_1997.txt": "useful", - "Papageorgiou_1994.txt": "useful", - "Pardo-Gandarillas_2014.txt": "useful", - "Parker_1986.txt": "useful", - "Parker_1994.txt": "useful", - "Paulissen_1987.txt": "useful", - "Paul_1975.txt": "useful", - "Pearre_1973.txt": "useful", - "Pereira_2018.txt": "useful", - "Pereze-Bote_2006.txt": "useful", - "Phillips_1991.txt": "useful", - "Phillips_2003.txt": "useful", - "Pierre_2024.txt": "useful", - "Pietersen_2010.txt": "useful", - "Pine_2005.txt": "useful", - "Pinkas_1970.txt": "useful", - "Piontek_2015.txt": "useful", - "Pizzatto_2008.txt": "useful", - "Plotz_1991.txt": "useful", - "Plummer_1981.txt": "useful", - "Plummer_1984.txt": "useful", - "Plyuscheva_2010.txt": "useful", - "Polis_1979.txt": "useful", - "Polymeni_2011.txt": "useful", - "Pomerleau_2011.txt": "useful", - "Poole_1996.txt": "useful", - "Portner_2022.txt": "useful", - "Pothoven_2015.txt": "useful", - "Potier_2007.txt": "useful", - "Preston_2012.txt": "useful", - "Preti_2001.txt": "useful", - "Preti_2004.txt": "useful", - "Preti_2008.txt": "useful", - "Preti_2012.txt": "useful", - "Preti_2020.txt": "useful", - "Preti_2023.txt": "useful", - "Prokopchuk_2006.txt": "useful", - "Pueta_2013.txt": "useful", - "Purcell_1981a.txt": "useful", - "Purcell_1981b.txt": "useful", - "Purcell_1982.txt": "useful", - "Purcell_2010.txt": "useful", - "Purdy_2004.txt": "useful", - "Quesada_2014.txt": "useful", - "Quirino_2015.txt": "useful", - "Randall_1967.txt": "useful", - "Raney_1947.txt": "useful", - "Rauschenplat_1901.txt": "useful", - "Read_2001.txt": "useful", - "Rebou\u00e7as_2013.txt": "useful", - "Reid_1956.txt": "useful", - "Reis_1996.txt": "useful", - "Reis_2020.txt": "useful", - "Roberts_2003.txt": "useful", - "Robert_1997.txt": "useful", - "Robilliard_1971.txt": "useful", - "Robinson_2015.txt": "useful", - "Rocha_1994.txt": "useful", - "Rodger_2021.txt": "useful", - "Rodrigues_2019.txt": "useful", - "Rodr\u00edguez-Garc\u00eda_2024.txt": "useful", - "Rodr\u00edguez-Robles_1999.txt": "useful", - "Rohner_2013.txt": "useful", - "Romanov_2008.txt": "useful", - "Romero_2011.txt": "useful", - "Rorbaek_2023.txt": "useful", - "Rosas-Luis_2015.txt": "useful", - "Rosenthal_1972.txt": "useful", - "Rose_2015.txt": "useful", - "Rudstam_1992.txt": "useful", - "Rysava-Novakova_2009.txt": "useful", - "R\u00edo-Garc\u00eda_2014.txt": "useful", - "Sallami_2015.txt": "useful", - "Salvidio_1999.txt": "useful", - "Santander-Neto_2021.txt": "useful", - "Santic_2012.txt": "useful", - "Santic_2021.txt": "useful", - "Santos_1999.txt": "useful", - "Sant_Anna_2015.txt": "useful", - "Sapounidis_2015.txt": "useful", - "Sato_2005.txt": "useful", - "Saunders_2015.txt": "useful", - "Saunders_2015b.txt": "useful", - "Savage_1967.txt": "useful", - "Schacht_1995.txt": "useful", - "Scheffer_1931.txt": "useful", - "Schubert_1986.txt": "useful", - "Scolardi_2006.txt": "useful", - "Scott_1903(1).txt": "useful", - "Scott_1903.txt": "useful", - "Sekiguchi_1992.txt": "useful", - "Selleslagh_2015.txt": "useful", - "Setyobudi_2024.txt": "useful", - "Shaiek_2015.txt": "useful", - "Shine_1986.txt": "useful", - "Shivji_1983.txt": "useful", - "Shufeldt_1887.txt": "useful", - "Siferd_1992.txt": "useful", - "Simila_2022.txt": "useful", - "Sinclaire_1992.txt": "useful", - "Siqueira_2006.txt": 
"useful", - "Sleeman_1992.txt": "useful", - "Slip_1995.txt": "useful", - "Sloan_1981.txt": "useful", - "Sluyus_2001.txt": "useful", - "Smith_1991.txt": "useful", - "Smith_2005.txt": "useful", - "Smith_2011.txt": "useful", - "Smuts_1979.txt": "useful", - "Soekoe_2022.txt": "useful", - "Sole_2009.txt": "useful", - "Sole_2018.txt": "useful", - "Soupir_2000.txt": "useful", - "Sousa_2015.txt": "useful", - "Southern_1941.txt": "useful", - "Spadinger_1999.txt": "useful", - "Spalding_1971.txt": "useful", - "Sprules_1945.txt": "useful", - "Steele_1986.txt": "useful", - "Steinarsd\u00f3ttir_2009.txt": "useful", - "Stewart_2005.txt": "useful", - "Strain_2014.txt": "useful", - "Stuart_1991.txt": "useful", - "Sunderland_1975.txt": "useful", - "Surface_1906.txt": "useful", - "Szczepanski_2014(1).txt": "useful", - "Szczepanski_2014.txt": "useful", - "S\u00f3lmundsson_2025.txt": "useful", - "Takahashi_1998.txt": "useful", - "Takeuchi_2019(1).txt": "useful", - "Takeuchi_2019.txt": "useful", - "Taniguchi_2005.txt": "useful", - "Taylor_1995.txt": "useful", - "Teixeira_2004.txt": "useful", - "Thompson_2009.txt": "useful", - "Thut_1970.txt": "useful", - "Tickell_2021.txt": "useful", - "Tidemann_1994.txt": "useful", - "TinHan_2021.txt": "useful", - "Tokeshi_1989.txt": "useful", - "Tonay_2016.txt": "useful", - "Tong_1986.txt": "useful", - "Tonnesson_2005.txt": "useful", - "Torres-Rojas_2009.txt": "useful", - "Townhill_2021.txt": "useful", - "Town_1980.txt": "useful", - "Tremblay-Gagnon_2023.txt": "useful", - "Tuttle_2009.txt": "useful", - "Uhler_1939.txt": "useful", - "Ulloa_2006.txt": "useful", - "Usman_2018.txt": "useful", - "Valderrama-Vernaza_2009.txt": "useful", - "Valdmann_1998.txt": "useful", - "Valdmann_2005.txt": "useful", - "Valls_2015.txt": "useful", - "Vanegas_2016.txt": "useful", - "VanHeezik_1990.txt": "useful", - "VanHyning_1932.txt": "useful", - "Varghese_2014.txt": "useful", - "Viana_2014.txt": "useful", - "Voris_1980.txt": "useful", - "Vrcibradic_2009.txt": "useful", - "Wallace_1990.txt": "useful", - "Wang_2012.txt": "useful", - "Wang_2023.txt": "useful", - "Ward_1988.txt": "useful", - "Washburn_2013.txt": "useful", - "Watanabe_2004.txt": "useful", - "Wear_1987.txt": "useful", - "Weber_1989.txt": "useful", - "Webster_1943.txt": "useful", - "Weidel_2000.txt": "useful", - "Wells_1961.txt": "useful", - "Wen_2012.txt": "useful", - "West_1986.txt": "useful", - "West_1988.txt": "useful", - "Whiles_2004.txt": "useful", - "White_1992.txt": "useful", - "Whitney_2024.txt": "useful", - "Wieters_2000.txt": "useful", - "Wilcox_2015.txt": "useful", - "Witman_2010.txt": "useful", - "Wood_1965.txt": "useful", - "Wright_1915.txt": "useful", - "Wu_2005.txt": "useful", - "Xavier_2002.txt": "useful", - "Xavier_2010.txt": "useful", - "Yamaguchi_2011.txt": "useful", - "Yang_2011.txt": "useful", - "Yatabe_2010.txt": "useful", - "Young_1986.txt": "useful", - "Zhou_2004.txt": "useful", - "Zunnna_2009.txt": "useful", - "Ainley_2005.txt": "not useful", - "AlejoPlata_2019.txt": "not useful", - "Allen_1942.txt": "not useful", - "Andersen_2021.txt": "not useful", - "Aznar_1994.txt": "not useful", - "Bajkov_1930.txt": "not useful", - "Balcombe_2005.txt": "not useful", - "Barry_1996.txt": "not useful", - "Belon_1555.txt": "not useful", - "Boesel_1938.txt": "not useful", - "Carol_2009.txt": "not useful", - "Chacko_1949.txt": "not useful", - "Choi_2022.txt": "not useful", - "Cormack_2023.txt": "not useful", - "Decker_2025.txt": "not useful", - "DiBeneditto_2014.txt": "not useful", - "Dymond_1929.txt": "not useful", - 
"Elwood_1969.txt": "not useful", - "Feng_2023.txt": "not useful", - "Ferreira_1999.txt": "not useful", - "Fries_1892.txt": "not useful", - "Gamez_2022.txt": "not useful", - "Hahn_2014.txt": "not useful", - "Islam_2018.txt": "not useful", - "Leonard_1940.txt": "not useful", - "Leonard_1949.txt": "not useful", - "Leuchtenberger_2020.txt": "not useful", - "Long_2000.txt": "not useful", - "Lowndess_1935.txt": "not useful", - "Martin_1995.txt": "not useful", - "McMahon_1999.txt": "not useful", - "Monadjem_1996.txt": "not useful", - "Ng_2021.txt": "not useful", - "Ohizumi_2003.txt": "not useful", - "Pereira_2017.txt": "not useful", - "Portner_2025.txt": "not useful", - "Pritchard_1944.txt": "not useful", - "Rosalino_2009.txt": "not useful", - "Simone_2022.txt": "not useful", - "Teillard_2024.txt": "not useful", - "Troina_2016.txt": "not useful", - "Uyeno_1991.txt": "not useful", - "Verrill_1871.txt": "not useful", - "Wright_1927.txt": "not useful", - "Wurtsbaugh_1975.txt": "not useful", - "Yoshino_2020.txt": "not useful", - "Yu_2022.txt": "not useful", - "Zheltenkova_1938.txt": "not useful" + "Adams_1989.txt": "useful", + "Berg_2002.txt": "useful", + "Dale_2011.txt": "useful", + "Fisher_2008.txt": "useful", + "Harris_2009.txt": "useful", + "Kerle_2000.txt": "useful", + "Marques_2015.txt": "useful", + "Pakhomov_1998.txt": "useful", + "Sousa_2015.txt": "useful", + "Wu_2005.txt": "useful", + "AlejoPlata_2019.txt": "not-useful", + "Decker_2025.txt": "not-useful", + "Ferreira_1999.txt": "not-useful", + "Gamez_2022.txt": "not-useful", + "Long_2000.txt": "not-useful", + "Ng_2021.txt": "not-useful", + "Ohizumi_2003.txt": "not-useful", + "Rosalino_2009.txt": "not-useful", + "Simone_2022.txt": "not-useful", + "Yoshino_2020.txt": "not-useful" } \ No newline at end of file diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index 95c2de2..7651a54 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -2,7 +2,7 @@ Modes: - API mode: Download PDFs from Google Drive and process them - - Local mode: Use PDFs already downloaded locally + - Local mode: Use PDFs already downloaded locally (DEFAULT) API Mode Environment variables: - GOOGLE_SERVICE_ACCOUNT_JSON (service account JSON string) @@ -10,14 +10,15 @@ - GOOGLE_DRIVE_USE_SHARED_DRIVE=true (if using shared drives / shared folders) Usage: + - Default (local): python full_pipeline.py - API mode: python full_pipeline.py --api - - Local mode: python full_pipeline.py --local + - Custom path: python full_pipeline.py --local C:\\path\\to\\data Behavior: - - API mode: Streams every PDF (no local PDF persistence) and writes extracted text to data/processed-text. - - Local mode: Processes PDFs from specified local directory (expects 'useful' and 'not-useful' subfolders). - - Generates labels.json based on folder origin. - - Trains model with src/model/train_model.py. 
+  - Processes PDFs from data/useful and data/not-useful folders
+  - Generates labels.json based on folder origin
+  - Trains the model with src/model/train_model.py
+  - Automatically generates classification results (CSV & JSON)
 """

 from __future__ import annotations
@@ -30,22 +31,31 @@
 import subprocess
 import sys

-import sys
-from pathlib import Path as _Path2
-
-sys.path.append(str(_Path2(__file__).resolve().parents[1]))  # add repo root to sys.path
+# Project setup - MUST run before the project imports below
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+os.chdir(PROJECT_ROOT)
+sys.path.insert(0, str(PROJECT_ROOT))

-from scripts.env_loader import load_env
+# Load environment (for API mode)
+try:
+    from scripts.env_loader import load_env
+    load_env()
+except Exception:  # .env support is optional in local mode
+    pass

-load_env()  # Load .env file if present (for local dev)
+# Import Google Drive modules (only needed for API mode)
+try:
+    from scripts.google_drive.drive_io import (
+        get_drive_service,
+        find_child_folder_id,
+        list_pdfs_in_folder,
+        download_file_bytes,
+        sanitize_filename,
+    )
+    GOOGLE_DRIVE_AVAILABLE = True
+except ImportError:
+    GOOGLE_DRIVE_AVAILABLE = False

-from scripts.google_drive.drive_io import (
-    get_drive_service,
-    find_child_folder_id,
-    list_pdfs_in_folder,
-    download_file_bytes,
-    sanitize_filename,
-)

 from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes

@@ -63,10 +73,15 @@ def write_labels(labels: Dict[str, str], output_file: Path):


 def process_api_mode():
     """Download PDFs from Google Drive and process them."""
-    root_id = os.environ.get("GOOGLE_DRIVE_ROOT_FOLDER_ID")
+    if not GOOGLE_DRIVE_AVAILABLE:
+        print("ERROR: Google Drive modules not available.")
+        print("Please install: pip install google-auth google-api-python-client")
+        return False
+
+    root_id = os.environ.get("GOOGLE_DRIVE_FOLDER_ID")
     if not root_id:
-        raise RuntimeError("Missing GOOGLE_DRIVE_ROOT_FOLDER_ID environment variable")
+        raise RuntimeError("Missing GOOGLE_DRIVE_FOLDER_ID environment variable")

     service = get_drive_service()
     useful_id = find_child_folder_id(service, root_id, "useful")
@@ -76,10 +91,11 @@
     if not not_useful_id:
         raise RuntimeError(f"Could not find 'not-useful' subfolder under root folder {root_id}")

-    out_dir = Path("data/processed-text")
+    out_dir = PROJECT_ROOT / "data" / "processed-text"
     out_dir.mkdir(parents=True, exist_ok=True)
     labels: Dict[str, str] = {}
-    count=1
+    count = 1
+
     for folder_id, label in [(useful_id, "useful"), (not_useful_id, "not-useful")]:
         files = list_pdfs_in_folder(service, folder_id, max_files=None)
         print(f"Found {len(files)} PDFs in folder label '{label}'")
@@ -91,14 +107,15 @@
             (out_dir / txt_name).write_text(text, encoding="utf-8")
             labels[txt_name] = label
             print(f"{count} Processed {f['name']}")
-            count+=1
+            count += 1

-    write_labels(labels, Path("data/labels.json"))
+    write_labels(labels, PROJECT_ROOT / "data" / "labels.json")
     print(f"Wrote {len(labels)} labeled text files.")
+    return True


 def process_local_mode(data_path: Path):
     """Process PDFs from local directory."""
     if not data_path.exists():
         raise RuntimeError(f"Data path does not exist: {data_path}")

@@ -110,13 +127,28 @@
     if not not_useful_dir.exists():
         raise RuntimeError(f"'not-useful' subfolder not found in {data_path}")

-    out_dir = Path("data/processed-text")
+    # Validate sufficient PDFs
+    useful_pdfs = list(useful_dir.glob("*.pdf"))
+    not_useful_pdfs = list(not_useful_dir.glob("*.pdf"))
+
+    print(f"Found {len(useful_pdfs)} PDFs in 'useful' folder")
+    print(f"Found {len(not_useful_pdfs)} PDFs in 'not-useful' folder")
+
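+    # NOTE: two PDFs per class is only a smoke-test floor so the pipeline can
+    # run end-to-end; a genuinely useful classifier needs far more labelled
+    # PDFs per class than this check enforces.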
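+        # The subprocess below is equivalent to running the classifier CLI by
+        # hand, using the flags defined in src/model/pdf_classifier.py:
+        #   python src/model/pdf_classifier.py --folder data/useful \
+        #       --model_dir src/model/models --output_dir data/results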
Check output above.") + else: + print("\nNo PDFs found in useful folder to classify.") def main(): @@ -142,38 +218,61 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - API mode: python full_pipeline.py --api - Local mode: python full_pipeline.py --local ./data/pdfs + Default (local): python full_pipeline.py + API mode: python full_pipeline.py --api + Custom path: python full_pipeline.py --local C:\\path\\to\\data """ ) - # Create mutually exclusive group for --api and --local - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( + parser.add_argument( "--api", action="store_true", help="Use API mode to download PDFs from Google Drive" ) - group.add_argument( + parser.add_argument( "--local", type=Path, metavar="PATH", - help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)" + default=None, + help="Use local mode with PDFs from specified directory (default: data/)" ) args = parser.parse_args() - if args.local: - print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local) - else: # args.api - print("Running in API mode (Google Drive)") - process_api_mode() + print("=" * 50) + print("FracFeedExtractor - Full Training Pipeline") + print("=" * 50) + print(f"Project folder: {PROJECT_ROOT}") + if args.api: + print("\nRunning in API mode (Google Drive)") + success = process_api_mode() + if not success: + sys.exit(1) + else: + data_path = args.local if args.local else PROJECT_ROOT / "data" + print(f"\nRunning in LOCAL mode") + print(f"Data path: {data_path}") + process_local_mode(data_path) + + print("\n" + "=" * 50) print("Beginning model training...") - run([sys.executable, "src/model/train_model.py"]) - print("Training complete.") + print("=" * 50) + train_script = PROJECT_ROOT / "src" / "model" / "train_model.py" + run([sys.executable, str(train_script)]) + + print("\n" + "=" * 50) + print("TRAINING COMPLETE!") + print("=" * 50) + print(f"Model saved to: {PROJECT_ROOT / 'src' / 'model' / 'models'}") + + # Generate CSV/JSON results + generate_results() + + print("\n" + "=" * 50) + print("All Done!") + print("=" * 50) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/model/models/pdf_classifier.json b/src/model/models/pdf_classifier.json new file mode 100644 index 0000000..4c76f05 --- /dev/null +++ b/src/model/models/pdf_classifier.json @@ -0,0 +1 @@ 
+{"learner":{"attributes":{"best_iteration":"0","best_score":"0.7031644284725189"},"feature_names":[],"feature_types":[],"gradient_booster":{"model":{"cats":{"enc":[],"feature_segments":[],"sorted_idx":[]},"gbtree_model_param":{"num_parallel_tree":"1","num_trees":"21"},"iteration_indptr":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21],"tree_info":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"trees":[{"base_weights":[-0E0,-4.5454547E-2,3.3333335E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":0,"left_children":[1,-1,-1],"loss_changes":[3.6060605E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-4.5454547E-2,3.3333335E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.75E0,1.75E0,2E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.6550421E-2,4.440188E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":1,"left_children":[1,-1,-1],"loss_changes":[2.9434924E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.6550421E-2,4.440188E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.4987297E0,1.7492157E0,1.7495139E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.5713807E-2,3.1201271E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":2,"left_children":[1,-1,-1],"loss_changes":[1.6023228E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[3.810747E-3,-2.5713807E-2,3.1201271E-2],"split_indices":[386,0,0],"split_type":[0,0,0],"sum_hessian":[2.9964888E0,1.7483754E0,1.2481135E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7401117E-2,2.5258869E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":3,"left_children":[1,-1,-1],"loss_changes":[2.0981195E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[2.056987E-2,-3.7401117E-2,2.5258869E-2],"split_indices":[537,0,0],"split_type":[0,0,0],"sum_hessian":[3.243742E0,1.4969167E0,1.7468252E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.6906194E-2,4.2121045E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":4,"left_children":[1,-1,-1],"loss_changes":[3.306724E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-3.6906194E-2,4.2121045E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2390664E0,1.4950526E0,1.7440138E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-1.04854666E-1,-4.094534E-2,1.6049905E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":5,"left_children":[1,-1,-1],"loss_changes":[2.0475786E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-4.094534E-2,1.6049905E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2317743E0,1.7397995E0,1.491975E0],"tree_param":{"num_deleted":"0","num_
feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.9208884E-2,3.4704812E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":6,"left_children":[1,-1,-1],"loss_changes":[2.2169628E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.9208884E-2,3.4704812E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.472232E0,1.9848684E0,1.4873638E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.8985728E-2,3.3738654E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":7,"left_children":[1,-1,-1],"loss_changes":[1.882357E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.8985728E-2,3.3738654E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.720318E0,1.2380944E0,1.4822236E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.214106E-2,3.7840564E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":8,"left_children":[1,-1,-1],"loss_changes":[2.0949135E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.214106E-2,3.7840564E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.4542952E0,1.7320973E0,1.7221978E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-1.4408588E-2,3.178102E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":9,"left_children":[1,-1,-1],"loss_changes":[1.2034733E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-1.4408588E-2,3.178102E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.949526E0,1.4806063E0,1.4689196E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.2085627E-2,3.265416E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":10,"left_children":[1,-1,-1],"loss_changes":[2.0702496E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-3.2085627E-2,3.265416E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.9391243E0,1.4706066E0,1.4685178E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.4657268E-2,1.3228178E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":11,"left_children":[1,-1,-1],"loss_changes":[7.1198344E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[2.056987E-2,-2.4657268E-2,1.3228178E-2],"split_indices":[537,0,0],"split_type":[0,0,0],"sum_hessian":[2.6867905E0,1.2167447E0,1.4700457E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7201285E-2,2.9706245E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":12,"left_children":[1,-1,-1],"loss_changes":[2.3646326E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"spli
t_conditions":[5.658831E-3,-3.7201285E-2,2.9706245E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.1595213E0,1.7090204E0,1.4505011E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.9499192E-2,3.9527934E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":13,"left_children":[1,-1,-1],"loss_changes":[2.6844969E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.9499192E-2,3.9527934E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.379565E0,1.447219E0,1.932346E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.8973341E-2,1.7765379E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":14,"left_children":[1,-1,-1],"loss_changes":[1.1598339E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.8973341E-2,1.7765379E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.1344721E0,1.4417509E0,1.6927214E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.3713494E-2,2.7436396E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":15,"left_children":[1,-1,-1],"loss_changes":[1.3900027E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.3448534E-3,-2.3713494E-2,2.7436396E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.3583946E0,1.9331914E0,1.4252031E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-1.7263865E-2,2.1163303E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":16,"left_children":[1,-1,-1],"loss_changes":[7.109931E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-1.7263865E-2,2.1163303E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.8661036E0,1.6826108E0,1.1834928E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.7158424E-2,2.1919927E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":17,"left_children":[1,-1,-1],"loss_changes":[1.2738018E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.7158424E-2,2.1919927E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.332507E0,1.4205837E0,1.911923E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7158858E-2,2.5633803E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":18,"left_children":[1,-1,-1],"loss_changes":[2.2300358E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.3448534E-3,-3.7158858E-2,2.5633803E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2962039E0,1.8948877E0,1.4013162E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.551791E-2,1.628439E-2],"categories":[],"categories_nodes":[],"
categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":19,"left_children":[1,-1,-1],"loss_changes":[9.076358E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.551791E-2,1.628439E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.0655773E0,1.3985367E0,1.6670406E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-1.558051E-1,-3.1970523E-2,2.2381728E-3],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":20,"left_children":[1,-1,-1],"loss_changes":[9.912287E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.957521E-3,-3.1970523E-2,2.2381728E-3],"split_indices":[352,0,0],"split_type":[0,0,0],"sum_hessian":[2.7935517E0,1.6391398E0,1.1544119E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}}]},"name":"gbtree"},"learner_model_param":{"base_score":"[5E-1]","boost_from_average":"1","num_class":"0","num_feature":"10000","num_target":"1"},"objective":{"name":"binary:logistic","reg_loss_param":{"scale_pos_weight":"1"}}},"version":[3,1,2]} \ No newline at end of file diff --git a/src/model/pdf_classifier.py b/src/model/pdf_classifier.py index 8b9c2ba..e4c661a 100644 --- a/src/model/pdf_classifier.py +++ b/src/model/pdf_classifier.py @@ -3,20 +3,46 @@ from pathlib import Path import xgboost as xgb import sys +import os +import time + +# Setup project root path, must be before other imports +PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) +os.chdir(PROJECT_ROOT) -sys.path.append(str(Path(__file__).resolve().parents[2])) from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +# Try to import structured output module +try: + from src.output.structured_output import ClassificationResult, OutputManager + STRUCTURED_OUTPUT_AVAILABLE = True +except ImportError as e: + print(f"[WARNING] Could not import structured_output: {e}") + STRUCTURED_OUTPUT_AVAILABLE = False + ClassificationResult = None + OutputManager = None + -# Classify a single PDF as useful or not useful based on its text content. -def classify_pdf(pdf_path, model_dir="src/model/models"): +def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False): + #Classify a single PDF as useful or not useful. + + start_time = time.time() model_path = Path(model_dir) / "pdf_classifier.json" vectorizer_path = Path(model_dir) / "tfidf_vectorizer.pkl" encoder_path = Path(model_dir) / "label_encoder.pkl" + filename = Path(pdf_path).name if not model_path.exists() or not vectorizer_path.exists() or not encoder_path.exists(): print(f"[ERROR] Missing model, encoder, or vectorizer in {model_dir}") - return + if return_result and STRUCTURED_OUTPUT_AVAILABLE: + return ClassificationResult( + filename=filename, + classification="unknown", + confidence=0.0, + error=f"Missing model files in {model_dir}", + ) + return None # Load model, encoder, and TF-IDF vectorizer model = xgb.Booster() @@ -28,14 +54,21 @@ def classify_pdf(pdf_path, model_dir="src/model/models"): text = extract_text_from_pdf(pdf_path) if not text.strip(): print(f"[ERROR] No text extracted from {pdf_path}. 
Skipping classification.")
-        return
+        if return_result and STRUCTURED_OUTPUT_AVAILABLE:
+            return ClassificationResult(
+                filename=filename,
+                classification="unknown",
+                confidence=0.0,
+                error="No text extracted from PDF",
+            )
+        return None

     # Transform text into vectorized TF-IDF format
     X_vec = vectorizer.transform([text])

     # Wrap in DMatrix for XGBoost prediction
     dtest = xgb.DMatrix(X_vec)
-    pred_prob = model.predict(dtest)[0]
+    pred_prob = float(model.predict(dtest)[0])
     pred_class = 1 if pred_prob >= 0.70 else 0

     # Convert numeric class back into original label name
@@ -46,16 +79,82 @@
     else:
         confidence = pred_prob
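+
+    # Worked example of the branch above: pred_prob = 0.25 falls below the
+    # 0.70 threshold, so pred_class = 0 and the reported confidence flips to
+    # 1 - 0.25 = 0.75 in favour of the class-0 label.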
+    processing_time = time.time() - start_time
+
     print("\n=== PDF Classification Result ===")
-    print(f"  File: {Path(pdf_path).name}")
+    print(f"  File: {filename}")
     print(f"  Prediction: {pred_label} ({confidence:.2%} confidence)")
     print("=================================\n")

+    if return_result and STRUCTURED_OUTPUT_AVAILABLE:
+        return ClassificationResult(
+            filename=filename,
+            classification=pred_label,
+            confidence=float(confidence),
+            processing_time_seconds=processing_time,
+            text_length=len(text),
+        )
+    return None
+
+
+def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/results"):
+    #Classify all PDFs in a folder and export results.
+
+    if not STRUCTURED_OUTPUT_AVAILABLE:
+        print("[ERROR] Structured output module not available.")
+        print("Make sure src/output/structured_output.py exists.")
+        return {}
+
+    folder = Path(folder_path)
+    if not folder.exists():
+        print(f"[ERROR] Folder not found: {folder_path}")
+        return {}
+
+    pdf_files = list(folder.glob("*.pdf"))
+    if not pdf_files:
+        print(f"[WARN] No PDF files found in {folder_path}")
+        return {}
+
+    print(f"Found {len(pdf_files)} PDF files to classify.")
+
+    manager = OutputManager(output_dir=output_dir)
+
+    for i, pdf_path in enumerate(pdf_files, 1):
+        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_path.name}")
+        result = classify_pdf(str(pdf_path), model_dir=model_dir, return_result=True)
+        if result:
+            manager.add_classification(result)
+
+    # Export results
+    paths = manager.export_all()
+
+    # Print summary
+    print("\n=== Classification Summary ===")
+    summary = manager.get_summary()
+    print(f"  Total files: {summary['total_classifications']}")
+    print(f"  Useful: {summary['useful_count']}")
+    print(f"  Not useful: {summary['not_useful_count']}")
+    print(f"  Avg confidence: {summary['average_classification_confidence']:.2%}")
+    print("==============================\n")
+
+    return paths


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Classify a PDF as useful or not useful.")
-    parser.add_argument("--pdf-path", type=str, help="Path to the PDF file to classify.")
+    parser = argparse.ArgumentParser(description="Classify PDFs as useful or not useful.")
+    parser.add_argument("--pdf-path", type=str, help="Path to a single PDF file to classify.")
+    parser.add_argument("--folder", type=str, help="Path to a folder of PDFs to classify.")
     parser.add_argument("--model_dir", type=str, default="src/model/models", help="Directory containing the trained model and TF-IDF vectorizer.")
+    parser.add_argument("--output_dir", type=str, default="data/results", help="Directory for output files (JSON/CSV).")
     args = parser.parse_args()

-    classify_pdf(args.pdf_path, args.model_dir)
+    if args.folder:
+        paths = classify_folder(args.folder, args.model_dir, args.output_dir)
+        if paths:
+            print("Exported files:")
+            for name, path in paths.items():
+                print(f"  {name}: {path}")
+    elif args.pdf_path:
+        classify_pdf(args.pdf_path, args.model_dir)
+    else:
+        parser.print_help()
diff --git a/src/output/__init__.py b/src/output/__init__.py
new file mode 100644
index 0000000..e38d94c
--- /dev/null
+++ b/src/output/__init__.py
@@ -0,0 +1,17 @@
+"""Output module for structured data export."""
+
+from .structured_output import (
+    ClassificationResult,
+    ExtractionResult,
+    OutputManager,
+    export_to_json,
+    export_to_csv,
+)
+
+__all__ = [
+    "ClassificationResult",
+    "ExtractionResult",
+    "OutputManager",
+    "export_to_json",
+    "export_to_csv",
+]
diff --git a/src/output/structured_output.py b/src/output/structured_output.py
new file mode 100644
index 0000000..dfeca0e
--- /dev/null
+++ b/src/output/structured_output.py
@@ -0,0 +1,317 @@
+"""Structured Output Module
+
+This module handles the export of classification and data extraction results
+to JSON and CSV formats with clear provenance and uncertainty tracking.
+
+Usage:
+    from src.output import OutputManager, ClassificationResult
+
+    # Create a classification result
+    result = ClassificationResult(
+        filename="Adams_1989.pdf",
+        classification="useful",
+        confidence=0.92,
+        model_version="1.0.0"
+    )
+
+    # Export results
+    manager = OutputManager(output_dir="data/results")
+    manager.add_classification(result)
+    manager.export_all()
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+
+@dataclass
+class ClassificationResult:
+    #Stores the result of classifying a single PDF.
+
+    filename: str
+    classification: str
+    confidence: float
+    model_version: str = "1.0.0"
+    processing_time_seconds: Optional[float] = None
+    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
+    text_length: Optional[int] = None
+    error: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        return asdict(self)
+
+
+@dataclass
+class ExtractionResult:
+    #Stores extracted data from a 'useful' PDF
+    filename: str
+    predator_species: Optional[str] = None
+    predator_common_name: Optional[str] = None
+    survey_location: Optional[str] = None
+    survey_latitude: Optional[float] = None
+    survey_longitude: Optional[float] = None
+    survey_year: Optional[int] = None
+    survey_month: Optional[int] = None
+    total_stomachs_examined: Optional[int] = None
+    empty_stomachs: Optional[int] = None
+    non_empty_stomachs: Optional[int] = None
+    fraction_feeding: Optional[float] = None
+    sample_size_confidence: Optional[float] = None
+    extraction_confidence: Optional[float] = None
+    extraction_notes: Optional[str] = None
+    source_text_snippet: Optional[str] = None
+    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
+    extractor_version: str = "1.0.0"
+    error: Optional[str] = None
+
+    def __post_init__(self):
+        #Calculate fraction_feeding if stomach counts are available
+        if (
+            self.fraction_feeding is None
+            and self.total_stomachs_examined is not None
+            and self.non_empty_stomachs is not None
+            and self.total_stomachs_examined > 0
+        ):
+            self.fraction_feeding = self.non_empty_stomachs / self.total_stomachs_examined
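+            # Worked example (the demo values at the bottom of this file):
+            # 132 non-empty of 144 examined -> fraction_feeding = 132/144 ≈ 0.917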
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        return asdict(self)
+
+
+@dataclass
+class PipelineResult:
+    #Combined result from the full pipeline (classification and extraction).
+
+    filename: str
+    classification: ClassificationResult
+    extraction: Optional[ExtractionResult] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        result = {
+            "filename": self.filename,
+            "classification": self.classification.to_dict(),
+        }
+        if self.extraction:
+            result["extraction"] = self.extraction.to_dict()
+        return result
+
+
+class OutputManager:
+    #Manages collection and export of pipeline results.
+
+    def __init__(self, output_dir: str = "data/results"):
+        #Initialize the OutputManager.
+
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.classifications: List[ClassificationResult] = []
+        self.extractions: List[ExtractionResult] = []
+        self.pipeline_results: List[PipelineResult] = []
+
+    def add_classification(self, result: ClassificationResult) -> None:
+        #Add a classification result to the collection
+        self.classifications.append(result)
+
+    def add_extraction(self, result: ExtractionResult) -> None:
+        #Add an extraction result to the collection.
+        self.extractions.append(result)
+
+    def add_pipeline_result(self, result: PipelineResult) -> None:
+        #Add a complete pipeline result to the collection
+        self.pipeline_results.append(result)
+        self.classifications.append(result.classification)
+        if result.extraction:
+            self.extractions.append(result.extraction)
+
+    def export_classifications_json(self, filename: str = "classifications.json") -> Path:
+        #Export classification results to JSON.
+        output_path = self.output_dir / filename
+        data = {
+            "metadata": {
+                "export_timestamp": datetime.utcnow().isoformat(),
+                "total_files": len(self.classifications),
+                "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
+                "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
+            },
+            "results": [c.to_dict() for c in self.classifications],
+        }
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        print(f"[INFO] Classifications exported to {output_path}")
+        return output_path
+
+    def export_classifications_csv(self, filename: str = "classifications.csv") -> Path:
+        #Export classification results to CSV.
+        output_path = self.output_dir / filename
+        if not self.classifications:
+            print("[WARN] No classifications to export.")
+            return output_path
+
+        fieldnames = list(self.classifications[0].to_dict().keys())
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for result in self.classifications:
+                writer.writerow(result.to_dict())
+        print(f"[INFO] Classifications exported to {output_path}")
+        return output_path
+
+    def export_extractions_json(self, filename: str = "extractions.json") -> Path:
+        #Export extraction results to JSON.
+        output_path = self.output_dir / filename
+        data = {
+            "metadata": {
+                "export_timestamp": datetime.utcnow().isoformat(),
+                "total_extractions": len(self.extractions),
+                "successful_extractions": sum(1 for e in self.extractions if e.error is None),
+            },
+            "results": [e.to_dict() for e in self.extractions],
+        }
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        print(f"[INFO] Extractions exported to {output_path}")
+        return output_path
+
+    def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
+        #Export extraction results to CSV.
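+        # Note: csv.DictWriter renders None-valued fields as empty cells, so
+        # optional extraction fields (e.g. survey_latitude) simply appear
+        # blank in the exported CSV.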
+        output_path = self.output_dir / filename
+        if not self.extractions:
+            print("[WARN] No extractions to export.")
+            return output_path
+
+        fieldnames = list(self.extractions[0].to_dict().keys())
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for result in self.extractions:
+                writer.writerow(result.to_dict())
+        print(f"[INFO] Extractions exported to {output_path}")
+        return output_path
+
+    def export_all(self, prefix: str = "") -> Dict[str, Path]:
+        #Export all results to both JSON and CSV formats.
+
+        paths = {}
+        if self.classifications:
+            paths["classifications_json"] = self.export_classifications_json(
+                f"{prefix}classifications.json" if prefix else "classifications.json"
+            )
+            paths["classifications_csv"] = self.export_classifications_csv(
+                f"{prefix}classifications.csv" if prefix else "classifications.csv"
+            )
+        if self.extractions:
+            paths["extractions_json"] = self.export_extractions_json(
+                f"{prefix}extractions.json" if prefix else "extractions.json"
+            )
+            paths["extractions_csv"] = self.export_extractions_csv(
+                f"{prefix}extractions.csv" if prefix else "extractions.csv"
+            )
+        return paths
+
+    def get_summary(self) -> Dict[str, Any]:
+        #Get a summary of all collected results.
+        return {
+            "total_classifications": len(self.classifications),
+            "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
+            "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
+            "total_extractions": len(self.extractions),
+            "successful_extractions": sum(1 for e in self.extractions if e.error is None),
+            "average_classification_confidence": (
+                sum(c.confidence for c in self.classifications) / len(self.classifications)
+                if self.classifications
+                else 0.0
+            ),
+        }
+
+
+# Convenience functions for simple use cases
+def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
+    #Export a list of result dictionaries to JSON.
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2)
+    return path
+
+
+def export_to_csv(results: List[Dict[str, Any]], output_path: str) -> Path:
+    #Export a list of result dictionaries to CSV.
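+    # Note: the CSV header comes from the first dict's keys; csv.DictWriter
+    # raises ValueError if a later dict has extra keys (its default
+    # extrasaction), and missing keys are written as empty strings.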
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if not results:
+        # Create empty file
+        path.touch()
+        return path
+
+    fieldnames = list(results[0].keys())
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(results)
+    return path
+
+
+if __name__ == "__main__":
+    # Example usage demonstration
+    print("=== FracFeedExtractor Structured Output ===\n")
+
+    # Create sample classification results
+    class_results = [
+        ClassificationResult(
+            filename="Adams_1989.pdf",
+            classification="useful",
+            confidence=0.92,
+            text_length=45000,
+        ),
+        ClassificationResult(
+            filename="Rosalino_2009.pdf",
+            classification="not-useful",
+            confidence=0.85,
+            text_length=32000,
+        ),
+    ]
+
+    # Create sample extraction result
+    extraction = ExtractionResult(
+        filename="Adams_1989.pdf",
+        predator_species="Pygoscelis papua",
+        predator_common_name="Gentoo Penguin",
+        survey_location="Marion Island, sub-Antarctic",
+        survey_latitude=-46.88,
+        survey_longitude=37.90,
+        survey_year=1984,
+        total_stomachs_examined=144,
+        empty_stomachs=12,
+        non_empty_stomachs=132,
+        extraction_confidence=0.88,
+        source_text_snippet="A total of 144 stomach samples was collected...",
+    )
+
+    # Use OutputManager
+    manager = OutputManager(output_dir="data/results")
+
+    for result in class_results:
+        manager.add_classification(result)
+
+    manager.add_extraction(extraction)
+
+    # Export all results
+    paths = manager.export_all()
+
+    print("\nExported files:")
+    for name, path in paths.items():
+        print(f"  {name}: {path}")
+
+    print("\nSummary:")
+    summary = manager.get_summary()
+    for key, value in summary.items():
+        print(f"  {key}: {value}")
diff --git a/tests/test_structured_output.py b/tests/test_structured_output.py
new file mode 100644
index 0000000..d6618da
--- /dev/null
+++ b/tests/test_structured_output.py
@@ -0,0 +1,319 @@
+"""Tests for the structured output module."""
+
+import pytest
+import json
+import csv
+from pathlib import Path
+from src.output.structured_output import (
+    ClassificationResult,
+    ExtractionResult,
+    PipelineResult,
+    OutputManager,
+    export_to_json,
+    export_to_csv,
+)
+
+
+class TestClassificationResult:
+    #Tests for ClassificationResult dataclass
+
+    def test_create_basic_result(self):
+        result = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.85,
+        )
+        assert result.filename == "test.pdf"
+        assert result.classification == "useful"
+        assert result.confidence == 0.85
+        assert result.model_version == "1.0.0"
+
+    def test_to_dict(self):
+        result = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.85,
+        )
+        d = result.to_dict()
+        assert d["filename"] == "test.pdf"
+        assert d["classification"] == "useful"
+        assert d["confidence"] == 0.85
+        assert "timestamp" in d
+
+    def test_with_error(self):
+        result = ClassificationResult(
+            filename="bad.pdf",
+            classification="unknown",
+            confidence=0.0,
+            error="Failed to extract text",
+        )
+        assert result.error == "Failed to extract text"
+
+
+class TestExtractionResult:
+    #Tests for ExtractionResult dataclass
+
+    def test_create_basic_result(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+            survey_year=2020,
+        )
+        assert result.filename == "test.pdf"
+        assert result.predator_species == "Canis lupus"
+        assert result.survey_year == 2020
+
+    def test_fraction_feeding_auto_calculation(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            non_empty_stomachs=75,
+        )
+        assert result.fraction_feeding == 0.75
+
+    def test_fraction_feeding_not_calculated_when_missing_data(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            # non_empty_stomachs not provided
+        )
+        assert result.fraction_feeding is None
+
+    def test_fraction_feeding_not_overwritten(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            non_empty_stomachs=75,
+            fraction_feeding=0.80,  # Manually set
+        )
+        assert result.fraction_feeding == 0.80  # Should not be overwritten
+
+    def test_to_dict(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+            survey_location="Yellowstone",
+            survey_year=2020,
+            total_stomachs_examined=50,
+            empty_stomachs=10,
+            non_empty_stomachs=40,
+        )
+        d = result.to_dict()
+        assert d["predator_species"] == "Canis lupus"
+        assert d["survey_location"] == "Yellowstone"
+        assert d["fraction_feeding"] == 0.80
+
+
+class TestPipelineResult:
+    #Tests for PipelineResult dataclass
+
+    def test_create_useful_result(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.9,
+        )
+        extraction = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+            extraction=extraction,
+        )
+        assert pipeline.extraction is not None
+
+    def test_create_not_useful_result(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="not-useful",
+            confidence=0.85,
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+            extraction=None,
+        )
+        assert pipeline.extraction is None
+
+    def test_to_dict(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.9,
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+        )
+        d = pipeline.to_dict()
+        assert "classification" in d
+        assert d["classification"]["confidence"] == 0.9
+
+
+class TestOutputManager:
+    #Tests for OutputManager class
+
+    @pytest.fixture
+    def output_dir(self, tmp_path):
+        return tmp_path / "results"
+
+    @pytest.fixture
+    def manager(self, output_dir):
+        return OutputManager(output_dir=str(output_dir))
+
+    @pytest.fixture
+    def sample_classifications(self):
+        return [
+            ClassificationResult(filename="a.pdf", classification="useful", confidence=0.9),
+            ClassificationResult(filename="b.pdf", classification="not-useful", confidence=0.8),
+            ClassificationResult(filename="c.pdf", classification="useful", confidence=0.95),
+        ]
+
+    @pytest.fixture
+    def sample_extractions(self):
+        return [
+            ExtractionResult(
+                filename="a.pdf",
+                predator_species="Species A",
+                total_stomachs_examined=100,
+                non_empty_stomachs=80,
+            ),
+            ExtractionResult(
+                filename="c.pdf",
+                predator_species="Species C",
+                total_stomachs_examined=50,
+                non_empty_stomachs=40,
+            ),
+        ]
+
+    def test_creates_output_directory(self, output_dir):
+        OutputManager(output_dir=str(output_dir))
+        assert output_dir.exists()
+
+    def test_add_classification(self, manager, sample_classifications):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        assert len(manager.classifications) == 3
+
+    def test_add_extraction(self, manager, sample_extractions):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+        assert len(manager.extractions) == 2
+
+    def test_export_classifications_json(self, manager, sample_classifications, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+
+        path = manager.export_classifications_json()
+        assert path.exists()
+
+        with open(path) as f:
+            data = json.load(f)
+
+        assert "metadata" in data
+        assert data["metadata"]["total_files"] == 3
+        assert data["metadata"]["useful_count"] == 2
+        assert len(data["results"]) == 3
+
+    def test_export_classifications_csv(self, manager, sample_classifications, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+
+        path = manager.export_classifications_csv()
+        assert path.exists()
+
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 3
+        assert rows[0]["filename"] == "a.pdf"
+
+    def test_export_extractions_json(self, manager, sample_extractions, output_dir):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        path = manager.export_extractions_json()
+        assert path.exists()
+
+        with open(path) as f:
+            data = json.load(f)
+
+        assert len(data["results"]) == 2
+        assert data["results"][0]["predator_species"] == "Species A"
+
+    def test_export_extractions_csv(self, manager, sample_extractions, output_dir):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        path = manager.export_extractions_csv()
+        assert path.exists()
+
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 2
+
+    def test_export_all(self, manager, sample_classifications, sample_extractions, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        paths = manager.export_all()
+
+        assert "classifications_json" in paths
+        assert "classifications_csv" in paths
+        assert "extractions_json" in paths
+        assert "extractions_csv" in paths
+
+        for path in paths.values():
+            assert path.exists()
+
+    def test_get_summary(self, manager, sample_classifications, sample_extractions):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        summary = manager.get_summary()
+
+        assert summary["total_classifications"] == 3
+        assert summary["useful_count"] == 2
+        assert summary["not_useful_count"] == 1
+        assert summary["total_extractions"] == 2
+
+
+class TestConvenienceFunctions:
+    #Tests for standalone export functions.
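+    # These exercise export_to_json/export_to_csv directly, without an
+    # OutputManager, to cover the standalone round-trip paths.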
+
+    def test_export_to_json(self, tmp_path):
+        results = [{"name": "test", "value": 123}]
+        path = export_to_json(results, str(tmp_path / "test.json"))
+
+        assert path.exists()
+        with open(path) as f:
+            data = json.load(f)
+        assert data == results
+
+    def test_export_to_csv(self, tmp_path):
+        results = [
+            {"name": "a", "value": 1},
+            {"name": "b", "value": 2},
+        ]
+        path = export_to_csv(results, str(tmp_path / "test.csv"))
+
+        assert path.exists()
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 2
+        assert rows[0]["name"] == "a"
+
+    def test_export_empty_csv(self, tmp_path):
+        path = export_to_csv([], str(tmp_path / "empty.csv"))
+        assert path.exists()

From 9e27236d12826f95c2cc6eeb80cd5df9e62277e1 Mon Sep 17 00:00:00 2001
From: sillygoose
Date: Tue, 2 Dec 2025 17:00:08 -0800
Subject: [PATCH 2/2] Fix code formatting

---
 src/model/pdf_classifier.py     | 27 +++++++------
 src/output/structured_output.py | 71 +++++++++++++--------------------
 tests/test_structured_output.py | 10 ++---
 3 files changed, 46 insertions(+), 62 deletions(-)

diff --git a/src/model/pdf_classifier.py b/src/model/pdf_classifier.py
index e4c661a..1202712 100644
--- a/src/model/pdf_classifier.py
+++ b/src/model/pdf_classifier.py
@@ -16,6 +16,7 @@
 # Try to import structured output module
 try:
     from src.output.structured_output import ClassificationResult, OutputManager
+
     STRUCTURED_OUTPUT_AVAILABLE = True
 except ImportError as e:
     print(f"[WARNING] Could not import structured_output: {e}")
@@ -25,8 +26,8 @@
 
 
 def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False):
-    #Classify a single PDF as useful or not useful.
-
+    # Classify a single PDF as useful or not useful.
+
     start_time = time.time()
     model_path = Path(model_dir) / "pdf_classifier.json"
     vectorizer_path = Path(model_dir) / "tfidf_vectorizer.pkl"
@@ -98,36 +99,36 @@ def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False):
 
 
 def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/results"):
-    #Classify all PDFs in a folder and export results.
-
+    # Classify all PDFs in a folder and export results.
+
     if not STRUCTURED_OUTPUT_AVAILABLE:
         print("[ERROR] Structured output module not available.")
         print("Make sure src/output/structured_output.py exists.")
         return {}
-
+
     folder = Path(folder_path)
     if not folder.exists():
         print(f"[ERROR] Folder not found: {folder_path}")
         return {}
-
+
     pdf_files = list(folder.glob("*.pdf"))
     if not pdf_files:
         print(f"[WARN] No PDF files found in {folder_path}")
        return {}
-
+
     print(f"Found {len(pdf_files)} PDF files to classify.")
-
+
     manager = OutputManager(output_dir=output_dir)
-
+
     for i, pdf_path in enumerate(pdf_files, 1):
         print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_path.name}")
         result = classify_pdf(str(pdf_path), model_dir=model_dir, return_result=True)
         if result:
             manager.add_classification(result)
-
+
     # Export results
     paths = manager.export_all()
-
+
     # Print summary
     print("\n=== Classification Summary ===")
     summary = manager.get_summary()
@@ -136,7 +137,7 @@ def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/
     print(f"  Not useful: {summary['not_useful_count']}")
     print(f"  Avg confidence: {summary['average_classification_confidence']:.2%}")
     print("==============================\n")
-
+
     return paths
 
 
@@ -157,4 +158,4 @@ def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/
     elif args.pdf_path:
         classify_pdf(args.pdf_path, args.model_dir)
     else:
-        parser.print_help()
\ No newline at end of file
+        parser.print_help()
diff --git a/src/output/structured_output.py b/src/output/structured_output.py
index dfeca0e..5f63c96 100644
--- a/src/output/structured_output.py
+++ b/src/output/structured_output.py
@@ -1,4 +1,4 @@
-"""Structured Output Module 
+"""Structured Output Module
 
 This module handles the export of classification and data extraction results
 to JSON and CSV formats with clear provenance and uncertainty tracking.
@@ -32,7 +32,7 @@
 
 @dataclass
 class ClassificationResult:
-    #Stores the result of classifying a single PDF.
+    # Stores the result of classifying a single PDF.
 
     filename: str
     classification: str
@@ -44,13 +44,13 @@ class ClassificationResult:
     error: Optional[str] = None
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         return asdict(self)
 
 
 @dataclass
 class ExtractionResult:
-    #Stores extracted data from a 'useful' PDF
+    # Stores extracted data from a 'useful' PDF
     filename: str
     predator_species: Optional[str] = None
     predator_common_name: Optional[str] = None
@@ -72,30 +72,25 @@ class ExtractionResult:
     error: Optional[str] = None
 
     def __post_init__(self):
-        #Calculate fraction_feeding if stomach counts are available
-        if (
-            self.fraction_feeding is None
-            and self.total_stomachs_examined is not None
-            and self.non_empty_stomachs is not None
-            and self.total_stomachs_examined > 0
-        ):
+        # Calculate fraction_feeding if stomach counts are available
+        if self.fraction_feeding is None and self.total_stomachs_examined is not None and self.non_empty_stomachs is not None and self.total_stomachs_examined > 0:
             self.fraction_feeding = self.non_empty_stomachs / self.total_stomachs_examined
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         return asdict(self)
 
 
 @dataclass
 class PipelineResult:
-    #Combined result from the full pipeline (classification and extraction).
+    # Combined result from the full pipeline (classification and extraction).
 
     filename: str
     classification: ClassificationResult
     extraction: Optional[ExtractionResult] = None
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         result = {
             "filename": self.filename,
             "classification": self.classification.to_dict(),
         }
@@ -106,10 +101,10 @@ def to_dict(self) -> Dict[str, Any]:
 
 
 class OutputManager:
-    #Manages collection and export of pipeline results.
+    # Manages collection and export of pipeline results.
 
     def __init__(self, output_dir: str = "data/results"):
-        #Initialize the OutputManager.
+        # Initialize the OutputManager.
 
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -118,22 +113,22 @@ def __init__(self, output_dir: str = "data/results"):
         self.pipeline_results: List[PipelineResult] = []
 
     def add_classification(self, result: ClassificationResult) -> None:
-        #Add a classification result to the collection
+        # Add a classification result to the collection
         self.classifications.append(result)
 
     def add_extraction(self, result: ExtractionResult) -> None:
-        #Add an extraction result to the collection.
+        # Add an extraction result to the collection.
         self.extractions.append(result)
 
     def add_pipeline_result(self, result: PipelineResult) -> None:
-        #Add a complete pipeline result to the collection
+        # Add a complete pipeline result to the collection
         self.pipeline_results.append(result)
         self.classifications.append(result.classification)
         if result.extraction:
             self.extractions.append(result.extraction)
 
     def export_classifications_json(self, filename: str = "classifications.json") -> Path:
-        #Export classification results to JSON.
+        # Export classification results to JSON.
         output_path = self.output_dir / filename
         data = {
             "metadata": {
@@ -150,7 +145,7 @@ def export_classifications_json(self, filename: str = "classifications.json") ->
         return output_path
 
     def export_classifications_csv(self, filename: str = "classifications.csv") -> Path:
-        #Export classification results to CSV.
+        # Export classification results to CSV.
         output_path = self.output_dir / filename
         if not self.classifications:
             print("[WARN] No classifications to export.")
@@ -166,7 +161,7 @@ def export_classifications_csv(self, filename: str = "classifications.csv") -> P
         return output_path
 
     def export_extractions_json(self, filename: str = "extractions.json") -> Path:
-        #Export extraction results to JSON.
+        # Export extraction results to JSON.
         output_path = self.output_dir / filename
         data = {
             "metadata": {
@@ -182,7 +177,7 @@ def export_extractions_json(self, filename: str = "extractions.json") -> Path:
         return output_path
 
     def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
-        #Export extraction results to CSV.
+        # Export extraction results to CSV.
         output_path = self.output_dir / filename
         if not self.extractions:
             print("[WARN] No extractions to export.")
@@ -198,44 +193,32 @@ def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
         return output_path
 
     def export_all(self, prefix: str = "") -> Dict[str, Path]:
-        #Export all results to both JSON and CSV formats.
+        # Export all results to both JSON and CSV formats.
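+        # Returns a mapping of logical names (e.g. "classifications_json") to
+        # the Paths that were written, so callers can log exactly what was produced.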
 
         paths = {}
         if self.classifications:
-            paths["classifications_json"] = self.export_classifications_json(
-                f"{prefix}classifications.json" if prefix else "classifications.json"
-            )
-            paths["classifications_csv"] = self.export_classifications_csv(
-                f"{prefix}classifications.csv" if prefix else "classifications.csv"
-            )
+            paths["classifications_json"] = self.export_classifications_json(f"{prefix}classifications.json" if prefix else "classifications.json")
+            paths["classifications_csv"] = self.export_classifications_csv(f"{prefix}classifications.csv" if prefix else "classifications.csv")
         if self.extractions:
-            paths["extractions_json"] = self.export_extractions_json(
-                f"{prefix}extractions.json" if prefix else "extractions.json"
-            )
-            paths["extractions_csv"] = self.export_extractions_csv(
-                f"{prefix}extractions.csv" if prefix else "extractions.csv"
-            )
+            paths["extractions_json"] = self.export_extractions_json(f"{prefix}extractions.json" if prefix else "extractions.json")
+            paths["extractions_csv"] = self.export_extractions_csv(f"{prefix}extractions.csv" if prefix else "extractions.csv")
         return paths
 
     def get_summary(self) -> Dict[str, Any]:
-        #Get a summary of all collected results.
+        # Get a summary of all collected results.
         return {
             "total_classifications": len(self.classifications),
             "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
             "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
             "total_extractions": len(self.extractions),
             "successful_extractions": sum(1 for e in self.extractions if e.error is None),
-            "average_classification_confidence": (
-                sum(c.confidence for c in self.classifications) / len(self.classifications)
-                if self.classifications
-                else 0.0
-            ),
+            "average_classification_confidence": (sum(c.confidence for c in self.classifications) / len(self.classifications) if self.classifications else 0.0),
         }
 
 
 # Convenience functions for simple use cases
 def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
-    #Export a list of result dictionaries to JSON.
+    # Export a list of result dictionaries to JSON.
     path = Path(output_path)
     path.parent.mkdir(parents=True, exist_ok=True)
     with open(path, "w", encoding="utf-8") as f:
@@ -244,7 +227,7 @@ def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
 
 def export_to_csv(results: List[Dict[str, Any]], output_path: str) -> Path:
-    #Export a list of result dictionaries to CSV.
+    # Export a list of result dictionaries to CSV.
     path = Path(output_path)
     path.parent.mkdir(parents=True, exist_ok=True)
     if not results:
diff --git a/tests/test_structured_output.py b/tests/test_structured_output.py
index d6618da..56c9def 100644
--- a/tests/test_structured_output.py
+++ b/tests/test_structured_output.py
@@ -15,7 +15,7 @@
 
 
 class TestClassificationResult:
-    #Tests for ClassificationResult dataclass
+    # Tests for ClassificationResult dataclass
 
     def test_create_basic_result(self):
         result = ClassificationResult(
@@ -51,7 +51,7 @@ def test_with_error(self):
 
 
 class TestExtractionResult:
-    #Tests for ExtractionResult dataclass
+    # Tests for ExtractionResult dataclass
 
     def test_create_basic_result(self):
         result = ExtractionResult(
@@ -105,7 +105,7 @@ def test_to_dict(self):
 
 
 class TestPipelineResult:
-    #Tests for PipelineResult dataclass
+    # Tests for PipelineResult dataclass
 
     def test_create_useful_result(self):
         classification = ClassificationResult(
@@ -153,7 +153,7 @@ def test_to_dict(self):
 
 
 class TestOutputManager:
-    #Tests for OutputManager class
+    # Tests for OutputManager class
 
     @pytest.fixture
     def output_dir(self, tmp_path):
@@ -288,7 +288,7 @@ def test_get_summary(self, manager, sample_classifications, sample_extractions):
 
 
 class TestConvenienceFunctions:
-    #Tests for standalone export functions.
+    # Tests for standalone export functions.
 
     def test_export_to_json(self, tmp_path):
         results = [{"name": "test", "value": 123}]
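
A minimal sketch of consuming the exports downstream (illustrative only; it
assumes the default "data/results" output directory and the JSON layout
produced by export_classifications_json/export_extractions_json above):

    import json
    from pathlib import Path

    results_dir = Path("data/results")

    # Classification counts come straight from the JSON metadata block.
    with open(results_dir / "classifications.json", encoding="utf-8") as f:
        classifications = json.load(f)
    print(classifications["metadata"]["useful_count"], "useful PDFs")

    # fraction_feeding is pre-computed per record (see ExtractionResult.__post_init__).
    with open(results_dir / "extractions.json", encoding="utf-8") as f:
        extractions = json.load(f)["results"]
    feeding = [e["fraction_feeding"] for e in extractions if e["fraction_feeding"] is not None]
    if feeding:
        print(f"Mean fraction feeding: {sum(feeding) / len(feeding):.2f}")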