From eae548afa016b9da59d224bee798db4569714081 Mon Sep 17 00:00:00 2001 From: sillygoose Date: Tue, 2 Dec 2025 13:34:59 -0800 Subject: [PATCH 1/2] Add structured output export to CSV and JSON --- data/labels.json | 687 +-------------------------- scripts/full_pipeline.py | 191 ++++++-- src/model/models/pdf_classifier.json | 1 + src/model/pdf_classifier.py | 119 ++++- src/output/__init__ | 17 + src/output/structured_output.py | 317 ++++++++++++ tests/test_structured_output.py | 319 +++++++++++++ 7 files changed, 928 insertions(+), 723 deletions(-) create mode 100644 src/model/models/pdf_classifier.json create mode 100644 src/output/__init__ create mode 100644 src/output/structured_output.py create mode 100644 tests/test_structured_output.py diff --git a/data/labels.json b/data/labels.json index 8c076c1..c25866b 100644 --- a/data/labels.json +++ b/data/labels.json @@ -1,669 +1,22 @@ { - "Abe_1989.txt": "useful", - "Adams_1989.txt": "useful", - "Agapov_1938.txt": "useful", - "Agudelo-Cantero_2015.txt": "useful", - "Aiken_1998.txt": "useful", - "Alcaraz_2015.txt": "useful", - "Alexander_1986.txt": "useful", - "Ali_2016.txt": "useful", - "Allison_2013.txt": "useful", - "Alterio_1997.txt": "useful", - "Altin_2015.txt": "useful", - "Andersone_2004.txt": "useful", - "Anderson_1999.txt": "useful", - "Andreone_1999.txt": "useful", - "Angelici_2005.txt": "useful", - "Angelici_2014.txt": "useful", - "Annett_1984.txt": "useful", - "Ansell_1999.txt": "useful", - "Antonelis_1994.txt": "useful", - "Archer_2004.txt": "useful", - "Arendt_2001.txt": "useful", - "Arrington_2002.txt": "useful", - "Aykanat_2020.txt": "useful", - "Baeta_2013.txt": "useful", - "Baghli_2002.txt": "useful", - "Bajkov_1934.txt": "useful", - "Bakaloudis_2012.txt": "useful", - "Bakus_1959.txt": "useful", - "Ballesteros_2009.txt": "useful", - "Bandeira_2017.txt": "useful", - "Baranovskaya_1935.txt": "useful", - "Barbini_2016.txt": "useful", - "Barbosa_2015.txt": "useful", - "Barrett_2012.txt": "useful", - "Baruah_2001.txt": "useful", - "Battaglia_2014.txt": "useful", - "Battle_1998.txt": "useful", - "Beard_2007.txt": "useful", - "Beauchamp_1990.txt": "useful", - "Beeton_1956.txt": "useful", - "Behn_2019.txt": "useful", - "Bengtson_1968.txt": "useful", - "Bengtsson_2023.txt": "useful", - "Berazategui_2007.txt": "useful", - "Berchtold_2015.txt": "useful", - "Berg_2002.txt": "useful", - "Berry_1981.txt": "useful", - "Best_1984.txt": "useful", - "Best_1986.txt": "useful", - "Best_1987.txt": "useful", - "Birkeland_1974.txt": "useful", - "Birkeland_1982.txt": "useful", - "Bisa_2007.txt": "useful", - "Blackburn_2006.txt": "useful", - "Blanco-Parra_2012.txt": "useful", - "Blanc_1998.txt": "useful", - "Boelter_2007.txt": "useful", - "Boelter_2012.txt": "useful", - "Bojsen_2005.txt": "useful", - "Bolek_1997.txt": "useful", - "Bonham_1941.txt": "useful", - "Borcherding_2019.txt": "useful", - "Bothma_1966a.txt": "useful", - "Bothma_1966b.txt": "useful", - "Bothma_1971.txt": "useful", - "Bourque_2018.txt": "useful", - "Bradstreet_1982.txt": "useful", - "Brandao_2003.txt": "useful", - "Braune_1987.txt": "useful", - "Brewer_1989.txt": "useful", - "Brito_2013.txt": "useful", - "Brito_2015.txt": "useful", - "Brun_1972.txt": "useful", - "Buckley_1997.txt": "useful", - "Burbank_2022.txt": "useful", - "Burghart_2010.txt": "useful", - "Bury_1973.txt": "useful", - "Bustnes_2001.txt": "useful", - "Bwong_2010.txt": "useful", - "Cabeceira_2015.txt": "useful", - "Cada_1987.txt": "useful", - "Caldart_2012.txt": "useful", - "Camilo_1993.txt": "useful", - 
"Camino_2023.txt": "useful", - "Carey_1972.txt": "useful", - "Carmona-Antonanzas_2016.txt": "useful", - "Caron_2004.txt": "useful", - "Carpenter_1952.txt": "useful", - "Carss_1993.txt": "useful", - "Cartes_1993.txt": "useful", - "Casali_2023.txt": "useful", - "Casta\u00f1eda_2006.txt": "useful", - "Castilla_1991.txt": "useful", - "Castriota_2007.txt": "useful", - "Castro_1989.txt": "useful", - "Castro_1990.txt": "useful", - "Castro_2016.txt": "useful", - "Catling_1988.txt": "useful", - "Caut_2013.txt": "useful", - "Cecala_2007.txt": "useful", - "Cerda_1993.txt": "useful", - "Chambellant_2013.txt": "useful", - "Chelotti_2022.txt": "useful", - "Cherel_2003.txt": "useful", - "Cherel_2004.txt": "useful", - "Chintiroglou_1992.txt": "useful", - "Chou_1995.txt": "useful", - "Christiansen_2012.txt": "useful", - "Clady_1974.txt": "useful", - "Clarke_1996.txt": "useful", - "Coco_2014.txt": "useful", - "Coelho_1997.txt": "useful", - "Collard_1970.txt": "useful", - "Condit_1984.txt": "useful", - "Connell_1961.txt": "useful", - "Connell_1970.txt": "useful", - "Copson_1986.txt": "useful", - "Cordone_2022.txt": "useful", - "Costa_2014.txt": "useful", - "Crabtree_1998.txt": "useful", - "Crawford_2009.txt": "useful", - "Cressey_1970.txt": "useful", - "Crnobrnja-Isailovi\u0107_2012.txt": "useful", - "Croxall_1988.txt": "useful", - "Custer_1996.txt": "useful", - "Cutter_1958.txt": "useful", - "Dale_2011.txt": "useful", - "Dayton_1977.txt": "useful", - "deAguiar_2004.txt": "useful", - "Dearborn_1977.txt": "useful", - "Dearborn_1986.txt": "useful", - "Dearborn_1991.txt": "useful", - "Dearborn_1996.txt": "useful", - "Dehn_2006.txt": "useful", - "deJuan_2007.txt": "useful", - "Delany_1990.txt": "useful", - "deOliveiraNeves_2014.txt": "useful", - "DeWitt_1972.txt": "useful", - "Djait_2019.txt": "useful", - "Dominguez_2000.txt": "useful", - "Downie_2010.txt": "useful", - "Drygala_2013.txt": "useful", - "Duggins_1983.txt": "useful", - "Dur\u00e9_2001.txt": "useful", - "Eichelbaum_1909.txt": "useful", - "Elrod_1981.txt": "useful", - "Eriksen_2021.txt": "useful", - "Espinoza_2015.txt": "useful", - "Evans_1996.txt": "useful", - "Fagade_1973.txt": "useful", - "Fairweather_1985.txt": "useful", - "Falke_2024.txt": "useful", - "Falk_1992.txt": "useful", - "Falk_1993.txt": "useful", - "Fanelli_2004.txt": "useful", - "Fay_1987.txt": "useful", - "Feder_1959.txt": "useful", - "Feigenbaum_1979.txt": "useful", - "Felix_2006.txt": "useful", - "Fenchel_1965.txt": "useful", - "Fernandes_2024.txt": "useful", - "Ferrari_1989.txt": "useful", - "Ferreira_2012.txt": "useful", - "Ferreira_2015.txt": "useful", - "Fevolden_1982.txt": "useful", - "Finley_1983.txt": "useful", - "Finley_1990.txt": "useful", - "Fisher_2008.txt": "useful", - "Fisk_2002.txt": "useful", - "Fitz_1991.txt": "useful", - "Fleharty_1967.txt": "useful", - "Flores_2008.txt": "useful", - "Force_1935.txt": "useful", - "Ford_1998.txt": "useful", - "Frantz_1970.txt": "useful", - "Fraser_1970.txt": "useful", - "Fratt_1984.txt": "useful", - "Freire_1995.txt": "useful", - "Freire_1996.txt": "useful", - "Fritz_1974.txt": "useful", - "Froneman_1998.txt": "useful", - "Fujinami_2018.txt": "useful", - "Fu_2013.txt": "useful", - "Gaborieau_2004.txt": "useful", - "Gales_1988.txt": "useful", - "Gales_1989.txt": "useful", - "Gales_1990.txt": "useful", - "Gales_1992.txt": "useful", - "Gale_2011.txt": "useful", - "Ganmanee_2003.txt": "useful", - "Garcia_1988.txt": "useful", - "Garda_2007.txt": "useful", - "Gaymer_2001.txt": "useful", - "Gaymer_2008.txt": "useful", - 
"Gelsleichter_1998.txt": "useful", - "Gibbs_2011.txt": "useful", - "Gil_2007.txt": "useful", - "Gil_2008.txt": "useful", - "Giuntoli_1978.txt": "useful", - "Glaudas_2008.txt": "useful", - "Glorioso_2010.txt": "useful", - "Godfrey_1980.txt": "useful", - "Gong_2023.txt": "useful", - "Gonzalez_1994.txt": "useful", - "Gorbatenko_2009.txt": "useful", - "Graham_2001.txt": "useful", - "Grainger_2020.txt": "useful", - "Gray_1994.txt": "useful", - "Greenstone_1983.txt": "useful", - "Gregory_1978.txt": "useful", - "Gregory_1991.txt": "useful", - "Gregory_2013.txt": "useful", - "Guillemette_1992.txt": "useful", - "Gunther_2013.txt": "useful", - "Gunzburger_1999.txt": "useful", - "Guseinov_2004.txt": "useful", - "Hales_2008.txt": "useful", - "Hamilton_1951.txt": "useful", - "Hamilton_1956.txt": "useful", - "Hantak_2016.txt": "useful", - "Hardy_2006.txt": "useful", - "Harris_2009.txt": "useful", - "Hayes_1985.txt": "useful", - "Heithaus_2001.txt": "useful", - "Henderson_1982.txt": "useful", - "Heng_2018.txt": "useful", - "Henschel_1994.txt": "useful", - "Hesse_2025.txt": "useful", - "Himmelman_1991.txt": "useful", - "Hindell_1988a.txt": "useful", - "Hindell_1988b.txt": "useful", - "Hirai_1999.txt": "useful", - "Hirai_2000.txt": "useful", - "Hirai_2002.txt": "useful", - "Hirai_2004.txt": "useful", - "Hirschfeld_2011.txt": "useful", - "Hockman_1983.txt": "useful", - "Holohan_1998.txt": "useful", - "Hop_1992.txt": "useful", - "Houston_1993.txt": "useful", - "Howell_2003.txt": "useful", - "Hromada_2003.txt": "useful", - "Hsueh_1992.txt": "useful", - "Huckembeck_2014.txt": "useful", - "Huey_2001.txt": "useful", - "Humphries_1992.txt": "useful", - "Huseynov_2005.txt": "useful", - "Hutton_1987.txt": "useful", - "Insley_2021.txt": "useful", - "Ito_2009.txt": "useful", - "Iverson_2024.txt": "useful", - "Jacquemin_2014.txt": "useful", - "Jarv_2011.txt": "useful", - "Jenson_1960.txt": "useful", - "Jeong-Chae_2020.txt": "useful", - "Jewett_1982.txt": "useful", - "Jewett_1983.txt": "useful", - "Jillson_1981.txt": "useful", - "Johnson_2015.txt": "useful", - "Joyce_2002.txt": "useful", - "Jude_1973.txt": "useful", - "Juinio_1992.txt": "useful", - "Jurajda_2016.txt": "useful", - "Kadri_2012.txt": "useful", - "Kadye_2012.txt": "useful", - "Kalogianni_2010.txt": "useful", - "Kam_1998.txt": "useful", - "Kangur_1998.txt": "useful", - "Kehayias_1996.txt": "useful", - "Keough_1979.txt": "useful", - "Kephart_1982.txt": "useful", - "Keppeler_2013.txt": "useful", - "Kerle_2000.txt": "useful", - "Keskinen_2004.txt": "useful", - "Khan_2013.txt": "useful", - "Khan_2014.txt": "useful", - "Kidawa_2011.txt": "useful", - "Kidera_2008.txt": "useful", - "Killengreen_2011.txt": "useful", - "Kimmerer_1984.txt": "useful", - "King_1982.txt": "useful", - "King_2005.txt": "useful", - "Klages_1990.txt": "useful", - "Klimstra_1959a.txt": "useful", - "Klimstra_1959b.txt": "useful", - "Kloock_2001.txt": "useful", - "Knickle_2013.txt": "useful", - "Kock_1994.txt": "useful", - "Kofron_1978.txt": "useful", - "Kolb_1979.txt": "useful", - "Kopecky_2011.txt": "useful", - "Kopecky_2012.txt": "useful", - "Kopecky_2015.txt": "useful", - "Kreiling_2021.txt": "useful", - "Krupa_2002.txt": "useful", - "Langford_1941.txt": "useful", - "Lanszki_2007.txt": "useful", - "Lanszki_2015.txt": "useful", - "Laptikhovsky_2014.txt": "useful", - "Laufer_2021.txt": "useful", - "Laur\u00eda-Manzano_2014.txt": "useful", - "LeBrasseur_1966.txt": "useful", - "Lefebvre_1990.txt": "useful", - "Leivas_2012.txt": "useful", - "Lemos_2015.txt": "useful", - "Leonard_1942.txt": 
"useful", - "Leon_2004.txt": "useful", - "Lever_1959.txt": "useful", - "Lewis_2014.txt": "useful", - "Lima_2010.txt": "useful", - "Lima_2022.txt": "useful", - "Lin_2008.txt": "useful", - "Lisboa_2012.txt": "useful", - "Liu_2015.txt": "useful", - "Li_2015.txt": "useful", - "Loh_2011.txt": "useful", - "Longhurst_1957.txt": "useful", - "Lonne_1992.txt": "useful", - "Louda_1979.txt": "useful", - "Lucifora_2005.txt": "useful", - "Lugton_1993.txt": "useful", - "Luiselli_2004.txt": "useful", - "Luiselli_2006.txt": "useful", - "Luna_2022.txt": "useful", - "Lydersen_1991.txt": "useful", - "Lynch_1985.txt": "useful", - "L\u00f3pez_2010.txt": "useful", - "Macale_2008.txt": "useful", - "Macartney_1989.txt": "useful", - "Maciel_2012.txt": "useful", - "Macy_1982.txt": "useful", - "Maeda_2004.txt": "useful", - "Magnusdottir_2012.txt": "useful", - "Mahan_2007.txt": "useful", - "Maia-Carneiro_2012.txt": "useful", - "Maia_2006.txt": "useful", - "Main_2011.txt": "useful", - "Majaneva_2013.txt": "useful", - "Marais_1984.txt": "useful", - "Marques_2015.txt": "useful", - "MarquezVelasquez_2019.txt": "useful", - "Martin_1996.txt": "useful", - "Maser_1983.txt": "useful", - "Mauzey_1966.txt": "useful", - "Mauzey_1968.txt": "useful", - "Mazzotti_2020.txt": "useful", - "McClintock_1985.txt": "useful", - "McCluskey_2021.txt": "useful", - "McDermott_1965.txt": "useful", - "McDermott_1987.txt": "useful", - "McDonald_2000.txt": "useful", - "McElroy_2006.txt": "useful", - "McWilliams_1989.txt": "useful", - "Measey_2004.txt": "useful", - "Meckstroth_2007.txt": "useful", - "Megina_2001.txt": "useful", - "Meheust_2015.txt": "useful", - "Meinzer_1975.txt": "useful", - "Meise_2003.txt": "useful", - "Mendonca_2009.txt": "useful", - "Menge_1972.txt": "useful", - "Menge_1974a.txt": "useful", - "Menge_1974b.txt": "useful", - "Mercier_2011.txt": "useful", - "Mesa_2015.txt": "useful", - "Metillo_2011.txt": "useful", - "Mikkelsen_2002.txt": "useful", - "Milanovich_2008.txt": "useful", - "Miller_1990.txt": "useful", - "Miller_1995.txt": "useful", - "Minello_1989.txt": "useful", - "Mitchell_1953.txt": "useful", - "Moku_2000.txt": "useful", - "Moncada-Rosas_2025.txt": "useful", - "Montague_1988.txt": "useful", - "Montano_2017.txt": "useful", - "Moreira_2014.txt": "useful", - "Moreno-Leon_2009.txt": "useful", - "Muenz_2008.txt": "useful", - "Murie_1992.txt": "useful", - "Murphy_1992.txt": "useful", - "Murphy_1995.txt": "useful", - "Murphy_2004.txt": "useful", - "Murphy_2008.txt": "useful", - "Mustamaki_2014.txt": "useful", - "Mutlu_1999.txt": "useful", - "Myhre_1975.txt": "useful", - "Nack_2015.txt": "useful", - "Nagorsen_1989.txt": "useful", - "Nakagawa_2023.txt": "useful", - "Nakano_2016.txt": "useful", - "Navarrete_2008.txt": "useful", - "Nentwig_1985.txt": "useful", - "Neves_2009.txt": "useful", - "Newman_2010.txt": "useful", - "Ngo_2014.txt": "useful", - "Ngo_2014a.txt": "useful", - "Ngo_2014b.txt": "useful", - "Nickels_2023.txt": "useful", - "Nielsen_2019.txt": "useful", - "Niethammer_1992.txt": "useful", - "Nilssen_1995.txt": "useful", - "Noda_1992.txt": "useful", - "Nogueira-Junior_2008.txt": "useful", - "Norman_1992.txt": "useful", - "Novak_2010.txt": "useful", - "Novak_2013.txt": "useful", - "Novak_2017.txt": "useful", - "Oh_2001.txt": "useful", - "Ojeda_1991.txt": "useful", - "Olson_2012.txt": "useful", - "Oosten_1938.txt": "useful", - "Opacak_2004.txt": "useful", - "Ordiano-Flores_2024.txt": "useful", - "Ordzie_1980.txt": "useful", - "Orejas_2001.txt": "useful", - "Orejas_2013.txt": "useful", - "Oresland_2000.txt": "useful", 
- "Oskarsson_2016.txt": "useful", - "Ottenbacher_1994.txt": "useful", - "Ovaska_1991.txt": "useful", - "Ozaki_2023.txt": "useful", - "Pages_1997.txt": "useful", - "Paine_1969.txt": "useful", - "Pakhomov_1998.txt": "useful", - "Palmer_1988.txt": "useful", - "Paltridge_1997.txt": "useful", - "Papageorgiou_1994.txt": "useful", - "Pardo-Gandarillas_2014.txt": "useful", - "Parker_1986.txt": "useful", - "Parker_1994.txt": "useful", - "Paulissen_1987.txt": "useful", - "Paul_1975.txt": "useful", - "Pearre_1973.txt": "useful", - "Pereira_2018.txt": "useful", - "Pereze-Bote_2006.txt": "useful", - "Phillips_1991.txt": "useful", - "Phillips_2003.txt": "useful", - "Pierre_2024.txt": "useful", - "Pietersen_2010.txt": "useful", - "Pine_2005.txt": "useful", - "Pinkas_1970.txt": "useful", - "Piontek_2015.txt": "useful", - "Pizzatto_2008.txt": "useful", - "Plotz_1991.txt": "useful", - "Plummer_1981.txt": "useful", - "Plummer_1984.txt": "useful", - "Plyuscheva_2010.txt": "useful", - "Polis_1979.txt": "useful", - "Polymeni_2011.txt": "useful", - "Pomerleau_2011.txt": "useful", - "Poole_1996.txt": "useful", - "Portner_2022.txt": "useful", - "Pothoven_2015.txt": "useful", - "Potier_2007.txt": "useful", - "Preston_2012.txt": "useful", - "Preti_2001.txt": "useful", - "Preti_2004.txt": "useful", - "Preti_2008.txt": "useful", - "Preti_2012.txt": "useful", - "Preti_2020.txt": "useful", - "Preti_2023.txt": "useful", - "Prokopchuk_2006.txt": "useful", - "Pueta_2013.txt": "useful", - "Purcell_1981a.txt": "useful", - "Purcell_1981b.txt": "useful", - "Purcell_1982.txt": "useful", - "Purcell_2010.txt": "useful", - "Purdy_2004.txt": "useful", - "Quesada_2014.txt": "useful", - "Quirino_2015.txt": "useful", - "Randall_1967.txt": "useful", - "Raney_1947.txt": "useful", - "Rauschenplat_1901.txt": "useful", - "Read_2001.txt": "useful", - "Rebou\u00e7as_2013.txt": "useful", - "Reid_1956.txt": "useful", - "Reis_1996.txt": "useful", - "Reis_2020.txt": "useful", - "Roberts_2003.txt": "useful", - "Robert_1997.txt": "useful", - "Robilliard_1971.txt": "useful", - "Robinson_2015.txt": "useful", - "Rocha_1994.txt": "useful", - "Rodger_2021.txt": "useful", - "Rodrigues_2019.txt": "useful", - "Rodr\u00edguez-Garc\u00eda_2024.txt": "useful", - "Rodr\u00edguez-Robles_1999.txt": "useful", - "Rohner_2013.txt": "useful", - "Romanov_2008.txt": "useful", - "Romero_2011.txt": "useful", - "Rorbaek_2023.txt": "useful", - "Rosas-Luis_2015.txt": "useful", - "Rosenthal_1972.txt": "useful", - "Rose_2015.txt": "useful", - "Rudstam_1992.txt": "useful", - "Rysava-Novakova_2009.txt": "useful", - "R\u00edo-Garc\u00eda_2014.txt": "useful", - "Sallami_2015.txt": "useful", - "Salvidio_1999.txt": "useful", - "Santander-Neto_2021.txt": "useful", - "Santic_2012.txt": "useful", - "Santic_2021.txt": "useful", - "Santos_1999.txt": "useful", - "Sant_Anna_2015.txt": "useful", - "Sapounidis_2015.txt": "useful", - "Sato_2005.txt": "useful", - "Saunders_2015.txt": "useful", - "Saunders_2015b.txt": "useful", - "Savage_1967.txt": "useful", - "Schacht_1995.txt": "useful", - "Scheffer_1931.txt": "useful", - "Schubert_1986.txt": "useful", - "Scolardi_2006.txt": "useful", - "Scott_1903(1).txt": "useful", - "Scott_1903.txt": "useful", - "Sekiguchi_1992.txt": "useful", - "Selleslagh_2015.txt": "useful", - "Setyobudi_2024.txt": "useful", - "Shaiek_2015.txt": "useful", - "Shine_1986.txt": "useful", - "Shivji_1983.txt": "useful", - "Shufeldt_1887.txt": "useful", - "Siferd_1992.txt": "useful", - "Simila_2022.txt": "useful", - "Sinclaire_1992.txt": "useful", - "Siqueira_2006.txt": 
"useful", - "Sleeman_1992.txt": "useful", - "Slip_1995.txt": "useful", - "Sloan_1981.txt": "useful", - "Sluyus_2001.txt": "useful", - "Smith_1991.txt": "useful", - "Smith_2005.txt": "useful", - "Smith_2011.txt": "useful", - "Smuts_1979.txt": "useful", - "Soekoe_2022.txt": "useful", - "Sole_2009.txt": "useful", - "Sole_2018.txt": "useful", - "Soupir_2000.txt": "useful", - "Sousa_2015.txt": "useful", - "Southern_1941.txt": "useful", - "Spadinger_1999.txt": "useful", - "Spalding_1971.txt": "useful", - "Sprules_1945.txt": "useful", - "Steele_1986.txt": "useful", - "Steinarsd\u00f3ttir_2009.txt": "useful", - "Stewart_2005.txt": "useful", - "Strain_2014.txt": "useful", - "Stuart_1991.txt": "useful", - "Sunderland_1975.txt": "useful", - "Surface_1906.txt": "useful", - "Szczepanski_2014(1).txt": "useful", - "Szczepanski_2014.txt": "useful", - "S\u00f3lmundsson_2025.txt": "useful", - "Takahashi_1998.txt": "useful", - "Takeuchi_2019(1).txt": "useful", - "Takeuchi_2019.txt": "useful", - "Taniguchi_2005.txt": "useful", - "Taylor_1995.txt": "useful", - "Teixeira_2004.txt": "useful", - "Thompson_2009.txt": "useful", - "Thut_1970.txt": "useful", - "Tickell_2021.txt": "useful", - "Tidemann_1994.txt": "useful", - "TinHan_2021.txt": "useful", - "Tokeshi_1989.txt": "useful", - "Tonay_2016.txt": "useful", - "Tong_1986.txt": "useful", - "Tonnesson_2005.txt": "useful", - "Torres-Rojas_2009.txt": "useful", - "Townhill_2021.txt": "useful", - "Town_1980.txt": "useful", - "Tremblay-Gagnon_2023.txt": "useful", - "Tuttle_2009.txt": "useful", - "Uhler_1939.txt": "useful", - "Ulloa_2006.txt": "useful", - "Usman_2018.txt": "useful", - "Valderrama-Vernaza_2009.txt": "useful", - "Valdmann_1998.txt": "useful", - "Valdmann_2005.txt": "useful", - "Valls_2015.txt": "useful", - "Vanegas_2016.txt": "useful", - "VanHeezik_1990.txt": "useful", - "VanHyning_1932.txt": "useful", - "Varghese_2014.txt": "useful", - "Viana_2014.txt": "useful", - "Voris_1980.txt": "useful", - "Vrcibradic_2009.txt": "useful", - "Wallace_1990.txt": "useful", - "Wang_2012.txt": "useful", - "Wang_2023.txt": "useful", - "Ward_1988.txt": "useful", - "Washburn_2013.txt": "useful", - "Watanabe_2004.txt": "useful", - "Wear_1987.txt": "useful", - "Weber_1989.txt": "useful", - "Webster_1943.txt": "useful", - "Weidel_2000.txt": "useful", - "Wells_1961.txt": "useful", - "Wen_2012.txt": "useful", - "West_1986.txt": "useful", - "West_1988.txt": "useful", - "Whiles_2004.txt": "useful", - "White_1992.txt": "useful", - "Whitney_2024.txt": "useful", - "Wieters_2000.txt": "useful", - "Wilcox_2015.txt": "useful", - "Witman_2010.txt": "useful", - "Wood_1965.txt": "useful", - "Wright_1915.txt": "useful", - "Wu_2005.txt": "useful", - "Xavier_2002.txt": "useful", - "Xavier_2010.txt": "useful", - "Yamaguchi_2011.txt": "useful", - "Yang_2011.txt": "useful", - "Yatabe_2010.txt": "useful", - "Young_1986.txt": "useful", - "Zhou_2004.txt": "useful", - "Zunnna_2009.txt": "useful", - "Ainley_2005.txt": "not useful", - "AlejoPlata_2019.txt": "not useful", - "Allen_1942.txt": "not useful", - "Andersen_2021.txt": "not useful", - "Aznar_1994.txt": "not useful", - "Bajkov_1930.txt": "not useful", - "Balcombe_2005.txt": "not useful", - "Barry_1996.txt": "not useful", - "Belon_1555.txt": "not useful", - "Boesel_1938.txt": "not useful", - "Carol_2009.txt": "not useful", - "Chacko_1949.txt": "not useful", - "Choi_2022.txt": "not useful", - "Cormack_2023.txt": "not useful", - "Decker_2025.txt": "not useful", - "DiBeneditto_2014.txt": "not useful", - "Dymond_1929.txt": "not useful", - 
"Elwood_1969.txt": "not useful", - "Feng_2023.txt": "not useful", - "Ferreira_1999.txt": "not useful", - "Fries_1892.txt": "not useful", - "Gamez_2022.txt": "not useful", - "Hahn_2014.txt": "not useful", - "Islam_2018.txt": "not useful", - "Leonard_1940.txt": "not useful", - "Leonard_1949.txt": "not useful", - "Leuchtenberger_2020.txt": "not useful", - "Long_2000.txt": "not useful", - "Lowndess_1935.txt": "not useful", - "Martin_1995.txt": "not useful", - "McMahon_1999.txt": "not useful", - "Monadjem_1996.txt": "not useful", - "Ng_2021.txt": "not useful", - "Ohizumi_2003.txt": "not useful", - "Pereira_2017.txt": "not useful", - "Portner_2025.txt": "not useful", - "Pritchard_1944.txt": "not useful", - "Rosalino_2009.txt": "not useful", - "Simone_2022.txt": "not useful", - "Teillard_2024.txt": "not useful", - "Troina_2016.txt": "not useful", - "Uyeno_1991.txt": "not useful", - "Verrill_1871.txt": "not useful", - "Wright_1927.txt": "not useful", - "Wurtsbaugh_1975.txt": "not useful", - "Yoshino_2020.txt": "not useful", - "Yu_2022.txt": "not useful", - "Zheltenkova_1938.txt": "not useful" + "Adams_1989.txt": "useful", + "Berg_2002.txt": "useful", + "Dale_2011.txt": "useful", + "Fisher_2008.txt": "useful", + "Harris_2009.txt": "useful", + "Kerle_2000.txt": "useful", + "Marques_2015.txt": "useful", + "Pakhomov_1998.txt": "useful", + "Sousa_2015.txt": "useful", + "Wu_2005.txt": "useful", + "AlejoPlata_2019.txt": "not-useful", + "Decker_2025.txt": "not-useful", + "Ferreira_1999.txt": "not-useful", + "Gamez_2022.txt": "not-useful", + "Long_2000.txt": "not-useful", + "Ng_2021.txt": "not-useful", + "Ohizumi_2003.txt": "not-useful", + "Rosalino_2009.txt": "not-useful", + "Simone_2022.txt": "not-useful", + "Yoshino_2020.txt": "not-useful" } \ No newline at end of file diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index 95c2de2..7651a54 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -2,7 +2,7 @@ Modes: - API mode: Download PDFs from Google Drive and process them - - Local mode: Use PDFs already downloaded locally + - Local mode: Use PDFs already downloaded locally (DEFAULT) API Mode Environment variables: - GOOGLE_SERVICE_ACCOUNT_JSON (service account JSON string) @@ -10,14 +10,15 @@ - GOOGLE_DRIVE_USE_SHARED_DRIVE=true (if using shared drives / shared folders) Usage: + - Default (local): python full_pipeline.py - API mode: python full_pipeline.py --api - - Local mode: python full_pipeline.py --local + - Custom path: python full_pipeline.py --local C:\\path\\to\\data Behavior: - - API mode: Streams every PDF (no local PDF persistence) and writes extracted text to data/processed-text. - - Local mode: Processes PDFs from specified local directory (expects 'useful' and 'not-useful' subfolders). - - Generates labels.json based on folder origin. - - Trains model with src/model/train_model.py. 
+  - Processes PDFs from data/useful and data/not-useful folders
+  - Generates labels.json based on folder origin
+  - Trains the model with src/model/train_model.py
+  - Automatically generates classification results (CSV & JSON)
 """

 from __future__ import annotations
@@ -30,22 +31,31 @@
 import subprocess
 import sys

-import sys
-from pathlib import Path as _Path2
-
-sys.path.append(str(_Path2(__file__).resolve().parents[1]))  # add repo root to sys.path
+# Project setup - MUST run before the project imports below
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+os.chdir(PROJECT_ROOT)
+sys.path.insert(0, str(PROJECT_ROOT))

-from scripts.env_loader import load_env
+# Load environment (for API mode)
+try:
+    from scripts.env_loader import load_env
+    load_env()
+except Exception:  # .env support is optional in local mode
+    pass

-load_env()  # Load .env file if present (for local dev)
+# Import Google Drive modules (only needed for API mode)
+try:
+    from scripts.google_drive.drive_io import (
+        get_drive_service,
+        find_child_folder_id,
+        list_pdfs_in_folder,
+        download_file_bytes,
+        sanitize_filename,
+    )
+    GOOGLE_DRIVE_AVAILABLE = True
+except ImportError:
+    GOOGLE_DRIVE_AVAILABLE = False

-from scripts.google_drive.drive_io import (
-    get_drive_service,
-    find_child_folder_id,
-    list_pdfs_in_folder,
-    download_file_bytes,
-    sanitize_filename,
-)

 from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes

@@ -63,10 +73,15 @@ def write_labels(labels: Dict[str, str], output_file: Path):


 def process_api_mode():
     """Download PDFs from Google Drive and process them."""
-    root_id = os.environ.get("GOOGLE_DRIVE_ROOT_FOLDER_ID")
+    if not GOOGLE_DRIVE_AVAILABLE:
+        print("ERROR: Google Drive modules not available.")
+        print("Please install: pip install google-auth google-api-python-client")
+        return False
+
+    root_id = os.environ.get("GOOGLE_DRIVE_FOLDER_ID")
     if not root_id:
-        raise RuntimeError("Missing GOOGLE_DRIVE_ROOT_FOLDER_ID environment variable")
+        raise RuntimeError("Missing GOOGLE_DRIVE_FOLDER_ID environment variable")

     service = get_drive_service()
     useful_id = find_child_folder_id(service, root_id, "useful")
@@ -76,10 +91,11 @@
     if not not_useful_id:
         raise RuntimeError(f"Could not find 'not-useful' subfolder under root folder {root_id}")

-    out_dir = Path("data/processed-text")
+    out_dir = PROJECT_ROOT / "data" / "processed-text"
     out_dir.mkdir(parents=True, exist_ok=True)
     labels: Dict[str, str] = {}
-    count=1
+    count = 1
+
     for folder_id, label in [(useful_id, "useful"), (not_useful_id, "not-useful")]:
         files = list_pdfs_in_folder(service, folder_id, max_files=None)
         print(f"Found {len(files)} PDFs in folder label '{label}'")
@@ -91,14 +107,15 @@
             (out_dir / txt_name).write_text(text, encoding="utf-8")
             labels[txt_name] = label
             print(f"{count} Processed {f['name']}")
-            count+=1
+            count += 1

-    write_labels(labels, Path("data/labels.json"))
+    write_labels(labels, PROJECT_ROOT / "data" / "labels.json")
     print(f"Wrote {len(labels)} labeled text files.")
+    return True


 def process_local_mode(data_path: Path):
     """Process PDFs from local directory."""
     if not data_path.exists():
         raise RuntimeError(f"Data path does not exist: {data_path}")

@@ -110,13 +127,28 @@
     if not not_useful_dir.exists():
         raise RuntimeError(f"'not-useful' subfolder not found in {data_path}")

-    out_dir = Path("data/processed-text")
+    # Validate sufficient PDFs
+    useful_pdfs = list(useful_dir.glob("*.pdf"))
+    not_useful_pdfs = list(not_useful_dir.glob("*.pdf"))
+
+    print(f"Found {len(useful_pdfs)} PDFs in 'useful' folder")
+    print(f"Found {len(not_useful_pdfs)} PDFs in 'not-useful' folder")
+
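+    # NOTE: two PDFs per class is only a smoke-test floor so the pipeline can
+    # run end-to-end; a genuinely useful classifier needs far more labelled
+    # PDFs per class than this check enforces.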
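+        # The subprocess below is equivalent to running the classifier CLI by
+        # hand, using the flags defined in src/model/pdf_classifier.py:
+        #   python src/model/pdf_classifier.py --folder data/useful \
+        #       --model_dir src/model/models --output_dir data/results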
Check output above.") + else: + print("\nNo PDFs found in useful folder to classify.") def main(): @@ -142,38 +218,61 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - API mode: python full_pipeline.py --api - Local mode: python full_pipeline.py --local ./data/pdfs + Default (local): python full_pipeline.py + API mode: python full_pipeline.py --api + Custom path: python full_pipeline.py --local C:\\path\\to\\data """ ) - # Create mutually exclusive group for --api and --local - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( + parser.add_argument( "--api", action="store_true", help="Use API mode to download PDFs from Google Drive" ) - group.add_argument( + parser.add_argument( "--local", type=Path, metavar="PATH", - help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)" + default=None, + help="Use local mode with PDFs from specified directory (default: data/)" ) args = parser.parse_args() - if args.local: - print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local) - else: # args.api - print("Running in API mode (Google Drive)") - process_api_mode() + print("=" * 50) + print("FracFeedExtractor - Full Training Pipeline") + print("=" * 50) + print(f"Project folder: {PROJECT_ROOT}") + if args.api: + print("\nRunning in API mode (Google Drive)") + success = process_api_mode() + if not success: + sys.exit(1) + else: + data_path = args.local if args.local else PROJECT_ROOT / "data" + print(f"\nRunning in LOCAL mode") + print(f"Data path: {data_path}") + process_local_mode(data_path) + + print("\n" + "=" * 50) print("Beginning model training...") - run([sys.executable, "src/model/train_model.py"]) - print("Training complete.") + print("=" * 50) + train_script = PROJECT_ROOT / "src" / "model" / "train_model.py" + run([sys.executable, str(train_script)]) + + print("\n" + "=" * 50) + print("TRAINING COMPLETE!") + print("=" * 50) + print(f"Model saved to: {PROJECT_ROOT / 'src' / 'model' / 'models'}") + + # Generate CSV/JSON results + generate_results() + + print("\n" + "=" * 50) + print("All Done!") + print("=" * 50) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/model/models/pdf_classifier.json b/src/model/models/pdf_classifier.json new file mode 100644 index 0000000..4c76f05 --- /dev/null +++ b/src/model/models/pdf_classifier.json @@ -0,0 +1 @@ 
+{"learner":{"attributes":{"best_iteration":"0","best_score":"0.7031644284725189"},"feature_names":[],"feature_types":[],"gradient_booster":{"model":{"cats":{"enc":[],"feature_segments":[],"sorted_idx":[]},"gbtree_model_param":{"num_parallel_tree":"1","num_trees":"21"},"iteration_indptr":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21],"tree_info":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"trees":[{"base_weights":[-0E0,-4.5454547E-2,3.3333335E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":0,"left_children":[1,-1,-1],"loss_changes":[3.6060605E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-4.5454547E-2,3.3333335E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.75E0,1.75E0,2E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.6550421E-2,4.440188E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":1,"left_children":[1,-1,-1],"loss_changes":[2.9434924E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.6550421E-2,4.440188E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.4987297E0,1.7492157E0,1.7495139E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.5713807E-2,3.1201271E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":2,"left_children":[1,-1,-1],"loss_changes":[1.6023228E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[3.810747E-3,-2.5713807E-2,3.1201271E-2],"split_indices":[386,0,0],"split_type":[0,0,0],"sum_hessian":[2.9964888E0,1.7483754E0,1.2481135E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7401117E-2,2.5258869E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":3,"left_children":[1,-1,-1],"loss_changes":[2.0981195E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[2.056987E-2,-3.7401117E-2,2.5258869E-2],"split_indices":[537,0,0],"split_type":[0,0,0],"sum_hessian":[3.243742E0,1.4969167E0,1.7468252E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.6906194E-2,4.2121045E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":4,"left_children":[1,-1,-1],"loss_changes":[3.306724E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-3.6906194E-2,4.2121045E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2390664E0,1.4950526E0,1.7440138E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-1.04854666E-1,-4.094534E-2,1.6049905E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":5,"left_children":[1,-1,-1],"loss_changes":[2.0475786E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-4.094534E-2,1.6049905E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2317743E0,1.7397995E0,1.491975E0],"tree_param":{"num_deleted":"0","num_
feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.9208884E-2,3.4704812E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":6,"left_children":[1,-1,-1],"loss_changes":[2.2169628E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.9208884E-2,3.4704812E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.472232E0,1.9848684E0,1.4873638E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.8985728E-2,3.3738654E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":7,"left_children":[1,-1,-1],"loss_changes":[1.882357E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.8985728E-2,3.3738654E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.720318E0,1.2380944E0,1.4822236E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.214106E-2,3.7840564E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":8,"left_children":[1,-1,-1],"loss_changes":[2.0949135E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.214106E-2,3.7840564E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.4542952E0,1.7320973E0,1.7221978E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-1.4408588E-2,3.178102E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":9,"left_children":[1,-1,-1],"loss_changes":[1.2034733E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-1.4408588E-2,3.178102E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.949526E0,1.4806063E0,1.4689196E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.2085627E-2,3.265416E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":10,"left_children":[1,-1,-1],"loss_changes":[2.0702496E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-3.2085627E-2,3.265416E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.9391243E0,1.4706066E0,1.4685178E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.4657268E-2,1.3228178E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":11,"left_children":[1,-1,-1],"loss_changes":[7.1198344E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[2.056987E-2,-2.4657268E-2,1.3228178E-2],"split_indices":[537,0,0],"split_type":[0,0,0],"sum_hessian":[2.6867905E0,1.2167447E0,1.4700457E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7201285E-2,2.9706245E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":12,"left_children":[1,-1,-1],"loss_changes":[2.3646326E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"spli
t_conditions":[5.658831E-3,-3.7201285E-2,2.9706245E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.1595213E0,1.7090204E0,1.4505011E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.9499192E-2,3.9527934E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":13,"left_children":[1,-1,-1],"loss_changes":[2.6844969E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.9499192E-2,3.9527934E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.379565E0,1.447219E0,1.932346E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.8973341E-2,1.7765379E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":14,"left_children":[1,-1,-1],"loss_changes":[1.1598339E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.8973341E-2,1.7765379E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.1344721E0,1.4417509E0,1.6927214E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.3713494E-2,2.7436396E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":15,"left_children":[1,-1,-1],"loss_changes":[1.3900027E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.3448534E-3,-2.3713494E-2,2.7436396E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.3583946E0,1.9331914E0,1.4252031E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-1.7263865E-2,2.1163303E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":16,"left_children":[1,-1,-1],"loss_changes":[7.109931E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-1.7263865E-2,2.1163303E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[2.8661036E0,1.6826108E0,1.1834928E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.7158424E-2,2.1919927E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":17,"left_children":[1,-1,-1],"loss_changes":[1.2738018E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.7158424E-2,2.1919927E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.332507E0,1.4205837E0,1.911923E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-3.7158858E-2,2.5633803E-2],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":18,"left_children":[1,-1,-1],"loss_changes":[2.2300358E0,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.3448534E-3,-3.7158858E-2,2.5633803E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.2962039E0,1.8948877E0,1.4013162E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-0E0,-2.551791E-2,1.628439E-2],"categories":[],"categories_nodes":[],"
categories_segments":[],"categories_sizes":[],"default_left":[0,0,0],"id":19,"left_children":[1,-1,-1],"loss_changes":[9.076358E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[5.658831E-3,-2.551791E-2,1.628439E-2],"split_indices":[691,0,0],"split_type":[0,0,0],"sum_hessian":[3.0655773E0,1.3985367E0,1.6670406E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}},{"base_weights":[-1.558051E-1,-3.1970523E-2,2.2381728E-3],"categories":[],"categories_nodes":[],"categories_segments":[],"categories_sizes":[],"default_left":[1,0,0],"id":20,"left_children":[1,-1,-1],"loss_changes":[9.912287E-1,0E0,0E0],"parents":[2147483647,0,0],"right_children":[2,-1,-1],"split_conditions":[6.957521E-3,-3.1970523E-2,2.2381728E-3],"split_indices":[352,0,0],"split_type":[0,0,0],"sum_hessian":[2.7935517E0,1.6391398E0,1.1544119E0],"tree_param":{"num_deleted":"0","num_feature":"10000","num_nodes":"3","size_leaf_vector":"1"}}]},"name":"gbtree"},"learner_model_param":{"base_score":"[5E-1]","boost_from_average":"1","num_class":"0","num_feature":"10000","num_target":"1"},"objective":{"name":"binary:logistic","reg_loss_param":{"scale_pos_weight":"1"}}},"version":[3,1,2]} \ No newline at end of file diff --git a/src/model/pdf_classifier.py b/src/model/pdf_classifier.py index 8b9c2ba..e4c661a 100644 --- a/src/model/pdf_classifier.py +++ b/src/model/pdf_classifier.py @@ -3,20 +3,46 @@ from pathlib import Path import xgboost as xgb import sys +import os +import time + +# Setup project root path, must be before other imports +PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) +os.chdir(PROJECT_ROOT) -sys.path.append(str(Path(__file__).resolve().parents[2])) from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +# Try to import structured output module +try: + from src.output.structured_output import ClassificationResult, OutputManager + STRUCTURED_OUTPUT_AVAILABLE = True +except ImportError as e: + print(f"[WARNING] Could not import structured_output: {e}") + STRUCTURED_OUTPUT_AVAILABLE = False + ClassificationResult = None + OutputManager = None + -# Classify a single PDF as useful or not useful based on its text content. -def classify_pdf(pdf_path, model_dir="src/model/models"): +def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False): + #Classify a single PDF as useful or not useful. + + start_time = time.time() model_path = Path(model_dir) / "pdf_classifier.json" vectorizer_path = Path(model_dir) / "tfidf_vectorizer.pkl" encoder_path = Path(model_dir) / "label_encoder.pkl" + filename = Path(pdf_path).name if not model_path.exists() or not vectorizer_path.exists() or not encoder_path.exists(): print(f"[ERROR] Missing model, encoder, or vectorizer in {model_dir}") - return + if return_result and STRUCTURED_OUTPUT_AVAILABLE: + return ClassificationResult( + filename=filename, + classification="unknown", + confidence=0.0, + error=f"Missing model files in {model_dir}", + ) + return None # Load model, encoder, and TF-IDF vectorizer model = xgb.Booster() @@ -28,14 +54,21 @@ def classify_pdf(pdf_path, model_dir="src/model/models"): text = extract_text_from_pdf(pdf_path) if not text.strip(): print(f"[ERROR] No text extracted from {pdf_path}. 
Skipping classification.")
-        return
+        if return_result and STRUCTURED_OUTPUT_AVAILABLE:
+            return ClassificationResult(
+                filename=filename,
+                classification="unknown",
+                confidence=0.0,
+                error="No text extracted from PDF",
+            )
+        return None

     # Transform text into vectorized TF-IDF format
     X_vec = vectorizer.transform([text])

     # Wrap in DMatrix for XGBoost prediction
     dtest = xgb.DMatrix(X_vec)
-    pred_prob = model.predict(dtest)[0]
+    pred_prob = float(model.predict(dtest)[0])
     pred_class = 1 if pred_prob >= 0.70 else 0

     # Convert numeric class back into original label name
@@ -46,16 +79,82 @@
     else:
         confidence = pred_prob
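+
+    # Worked example of the branch above: pred_prob = 0.25 falls below the
+    # 0.70 threshold, so pred_class = 0 and the reported confidence flips to
+    # 1 - 0.25 = 0.75 in favour of the class-0 label.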
+    processing_time = time.time() - start_time
+
     print("\n=== PDF Classification Result ===")
-    print(f"  File: {Path(pdf_path).name}")
+    print(f"  File: {filename}")
     print(f"  Prediction: {pred_label} ({confidence:.2%} confidence)")
     print("=================================\n")

+    if return_result and STRUCTURED_OUTPUT_AVAILABLE:
+        return ClassificationResult(
+            filename=filename,
+            classification=pred_label,
+            confidence=float(confidence),
+            processing_time_seconds=processing_time,
+            text_length=len(text),
+        )
+    return None
+
+
+def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/results"):
+    #Classify all PDFs in a folder and export results.
+
+    if not STRUCTURED_OUTPUT_AVAILABLE:
+        print("[ERROR] Structured output module not available.")
+        print("Make sure src/output/structured_output.py exists.")
+        return {}
+
+    folder = Path(folder_path)
+    if not folder.exists():
+        print(f"[ERROR] Folder not found: {folder_path}")
+        return {}
+
+    pdf_files = list(folder.glob("*.pdf"))
+    if not pdf_files:
+        print(f"[WARN] No PDF files found in {folder_path}")
+        return {}
+
+    print(f"Found {len(pdf_files)} PDF files to classify.")
+
+    manager = OutputManager(output_dir=output_dir)
+
+    for i, pdf_path in enumerate(pdf_files, 1):
+        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_path.name}")
+        result = classify_pdf(str(pdf_path), model_dir=model_dir, return_result=True)
+        if result:
+            manager.add_classification(result)
+
+    # Export results
+    paths = manager.export_all()
+
+    # Print summary
+    print("\n=== Classification Summary ===")
+    summary = manager.get_summary()
+    print(f"  Total files: {summary['total_classifications']}")
+    print(f"  Useful: {summary['useful_count']}")
+    print(f"  Not useful: {summary['not_useful_count']}")
+    print(f"  Avg confidence: {summary['average_classification_confidence']:.2%}")
+    print("==============================\n")
+
+    return paths


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Classify a PDF as useful or not useful.")
-    parser.add_argument("--pdf-path", type=str, help="Path to the PDF file to classify.")
+    parser = argparse.ArgumentParser(description="Classify PDFs as useful or not useful.")
+    parser.add_argument("--pdf-path", type=str, help="Path to a single PDF file to classify.")
+    parser.add_argument("--folder", type=str, help="Path to a folder of PDFs to classify.")
     parser.add_argument("--model_dir", type=str, default="src/model/models", help="Directory containing the trained model and TF-IDF vectorizer.")
+    parser.add_argument("--output_dir", type=str, default="data/results", help="Directory for output files (JSON/CSV).")
     args = parser.parse_args()

-    classify_pdf(args.pdf_path, args.model_dir)
+    if args.folder:
+        paths = classify_folder(args.folder, args.model_dir, args.output_dir)
+        if paths:
+            print("Exported files:")
+            for name, path in paths.items():
+                print(f"  {name}: {path}")
+    elif args.pdf_path:
+        classify_pdf(args.pdf_path, args.model_dir)
+    else:
+        parser.print_help()
diff --git a/src/output/__init__.py b/src/output/__init__.py
new file mode 100644
index 0000000..e38d94c
--- /dev/null
+++ b/src/output/__init__.py
@@ -0,0 +1,17 @@
+"""Output module for structured data export."""
+
+from .structured_output import (
+    ClassificationResult,
+    ExtractionResult,
+    OutputManager,
+    export_to_json,
+    export_to_csv,
+)
+
+__all__ = [
+    "ClassificationResult",
+    "ExtractionResult",
+    "OutputManager",
+    "export_to_json",
+    "export_to_csv",
+]
diff --git a/src/output/structured_output.py b/src/output/structured_output.py
new file mode 100644
index 0000000..dfeca0e
--- /dev/null
+++ b/src/output/structured_output.py
@@ -0,0 +1,317 @@
+"""Structured Output Module
+
+This module handles the export of classification and data extraction results
+to JSON and CSV formats with clear provenance and uncertainty tracking.
+
+Usage:
+    from src.output import OutputManager, ClassificationResult
+
+    # Create a classification result
+    result = ClassificationResult(
+        filename="Adams_1989.pdf",
+        classification="useful",
+        confidence=0.92,
+        model_version="1.0.0"
+    )
+
+    # Export results
+    manager = OutputManager(output_dir="data/results")
+    manager.add_classification(result)
+    manager.export_all()
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+
+@dataclass
+class ClassificationResult:
+    #Stores the result of classifying a single PDF.
+
+    filename: str
+    classification: str
+    confidence: float
+    model_version: str = "1.0.0"
+    processing_time_seconds: Optional[float] = None
+    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
+    text_length: Optional[int] = None
+    error: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        return asdict(self)
+
+
+@dataclass
+class ExtractionResult:
+    #Stores extracted data from a 'useful' PDF
+    filename: str
+    predator_species: Optional[str] = None
+    predator_common_name: Optional[str] = None
+    survey_location: Optional[str] = None
+    survey_latitude: Optional[float] = None
+    survey_longitude: Optional[float] = None
+    survey_year: Optional[int] = None
+    survey_month: Optional[int] = None
+    total_stomachs_examined: Optional[int] = None
+    empty_stomachs: Optional[int] = None
+    non_empty_stomachs: Optional[int] = None
+    fraction_feeding: Optional[float] = None
+    sample_size_confidence: Optional[float] = None
+    extraction_confidence: Optional[float] = None
+    extraction_notes: Optional[str] = None
+    source_text_snippet: Optional[str] = None
+    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
+    extractor_version: str = "1.0.0"
+    error: Optional[str] = None
+
+    def __post_init__(self):
+        #Calculate fraction_feeding if stomach counts are available
+        if (
+            self.fraction_feeding is None
+            and self.total_stomachs_examined is not None
+            and self.non_empty_stomachs is not None
+            and self.total_stomachs_examined > 0
+        ):
+            self.fraction_feeding = self.non_empty_stomachs / self.total_stomachs_examined
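+            # Worked example (the demo values at the bottom of this file):
+            # 132 non-empty of 144 examined -> fraction_feeding = 132/144 ≈ 0.917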
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        return asdict(self)
+
+
+@dataclass
+class PipelineResult:
+    #Combined result from the full pipeline (classification and extraction).
+
+    filename: str
+    classification: ClassificationResult
+    extraction: Optional[ExtractionResult] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        #Convert to dictionary for JSON serialization
+        result = {
+            "filename": self.filename,
+            "classification": self.classification.to_dict(),
+        }
+        if self.extraction:
+            result["extraction"] = self.extraction.to_dict()
+        return result
+
+
+class OutputManager:
+    #Manages collection and export of pipeline results.
+
+    def __init__(self, output_dir: str = "data/results"):
+        #Initialize the OutputManager.
+
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.classifications: List[ClassificationResult] = []
+        self.extractions: List[ExtractionResult] = []
+        self.pipeline_results: List[PipelineResult] = []
+
+    def add_classification(self, result: ClassificationResult) -> None:
+        #Add a classification result to the collection
+        self.classifications.append(result)
+
+    def add_extraction(self, result: ExtractionResult) -> None:
+        #Add an extraction result to the collection.
+        self.extractions.append(result)
+
+    def add_pipeline_result(self, result: PipelineResult) -> None:
+        #Add a complete pipeline result to the collection
+        self.pipeline_results.append(result)
+        self.classifications.append(result.classification)
+        if result.extraction:
+            self.extractions.append(result.extraction)
+
+    def export_classifications_json(self, filename: str = "classifications.json") -> Path:
+        #Export classification results to JSON.
+        output_path = self.output_dir / filename
+        data = {
+            "metadata": {
+                "export_timestamp": datetime.utcnow().isoformat(),
+                "total_files": len(self.classifications),
+                "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
+                "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
+            },
+            "results": [c.to_dict() for c in self.classifications],
+        }
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        print(f"[INFO] Classifications exported to {output_path}")
+        return output_path
+
+    def export_classifications_csv(self, filename: str = "classifications.csv") -> Path:
+        #Export classification results to CSV.
+        output_path = self.output_dir / filename
+        if not self.classifications:
+            print("[WARN] No classifications to export.")
+            return output_path
+
+        fieldnames = list(self.classifications[0].to_dict().keys())
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for result in self.classifications:
+                writer.writerow(result.to_dict())
+        print(f"[INFO] Classifications exported to {output_path}")
+        return output_path
+
+    def export_extractions_json(self, filename: str = "extractions.json") -> Path:
+        #Export extraction results to JSON.
+        output_path = self.output_dir / filename
+        data = {
+            "metadata": {
+                "export_timestamp": datetime.utcnow().isoformat(),
+                "total_extractions": len(self.extractions),
+                "successful_extractions": sum(1 for e in self.extractions if e.error is None),
+            },
+            "results": [e.to_dict() for e in self.extractions],
+        }
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        print(f"[INFO] Extractions exported to {output_path}")
+        return output_path
+
+    def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
+        #Export extraction results to CSV.
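+        # Note: csv.DictWriter renders None-valued fields as empty cells, so
+        # optional extraction fields (e.g. survey_latitude) simply appear
+        # blank in the exported CSV.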
+        output_path = self.output_dir / filename
+        if not self.extractions:
+            print("[WARN] No extractions to export.")
+            return output_path
+
+        fieldnames = list(self.extractions[0].to_dict().keys())
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for result in self.extractions:
+                writer.writerow(result.to_dict())
+        print(f"[INFO] Extractions exported to {output_path}")
+        return output_path
+
+    def export_all(self, prefix: str = "") -> Dict[str, Path]:
+        #Export all results to both JSON and CSV formats.
+
+        paths = {}
+        if self.classifications:
+            paths["classifications_json"] = self.export_classifications_json(
+                f"{prefix}classifications.json" if prefix else "classifications.json"
+            )
+            paths["classifications_csv"] = self.export_classifications_csv(
+                f"{prefix}classifications.csv" if prefix else "classifications.csv"
+            )
+        if self.extractions:
+            paths["extractions_json"] = self.export_extractions_json(
+                f"{prefix}extractions.json" if prefix else "extractions.json"
+            )
+            paths["extractions_csv"] = self.export_extractions_csv(
+                f"{prefix}extractions.csv" if prefix else "extractions.csv"
+            )
+        return paths
+
+    def get_summary(self) -> Dict[str, Any]:
+        #Get a summary of all collected results.
+        return {
+            "total_classifications": len(self.classifications),
+            "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
+            "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
+            "total_extractions": len(self.extractions),
+            "successful_extractions": sum(1 for e in self.extractions if e.error is None),
+            "average_classification_confidence": (
+                sum(c.confidence for c in self.classifications) / len(self.classifications)
+                if self.classifications
+                else 0.0
+            ),
+        }
+
+
+# Convenience functions for simple use cases
+def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
+    #Export a list of result dictionaries to JSON.
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2)
+    return path
+
+
+def export_to_csv(results: List[Dict[str, Any]], output_path: str) -> Path:
+    #Export a list of result dictionaries to CSV.
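+    # Note: the CSV header comes from the first dict's keys; csv.DictWriter
+    # raises ValueError if a later dict has extra keys (its default
+    # extrasaction), and missing keys are written as empty strings.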
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if not results:
+        # Create empty file
+        path.touch()
+        return path
+
+    fieldnames = list(results[0].keys())
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(results)
+    return path
+
+
+if __name__ == "__main__":
+    # Example usage demonstration
+    print("=== FracFeedExtractor Structured Output ===\n")
+
+    # Create sample classification results
+    class_results = [
+        ClassificationResult(
+            filename="Adams_1989.pdf",
+            classification="useful",
+            confidence=0.92,
+            text_length=45000,
+        ),
+        ClassificationResult(
+            filename="Rosalino_2009.pdf",
+            classification="not-useful",
+            confidence=0.85,
+            text_length=32000,
+        ),
+    ]
+
+    # Create sample extraction result
+    extraction = ExtractionResult(
+        filename="Adams_1989.pdf",
+        predator_species="Pygoscelis papua",
+        predator_common_name="Gentoo Penguin",
+        survey_location="Marion Island, sub-Antarctic",
+        survey_latitude=-46.88,
+        survey_longitude=37.90,
+        survey_year=1984,
+        total_stomachs_examined=144,
+        empty_stomachs=12,
+        non_empty_stomachs=132,
+        extraction_confidence=0.88,
+        source_text_snippet="A total of 144 stomach samples was collected...",
+    )
+
+    # Use OutputManager
+    manager = OutputManager(output_dir="data/results")
+
+    for result in class_results:
+        manager.add_classification(result)
+
+    manager.add_extraction(extraction)
+
+    # Export all results
+    paths = manager.export_all()
+
+    print("\nExported files:")
+    for name, path in paths.items():
+        print(f"  {name}: {path}")
+
+    print("\nSummary:")
+    summary = manager.get_summary()
+    for key, value in summary.items():
+        print(f"  {key}: {value}")
diff --git a/tests/test_structured_output.py b/tests/test_structured_output.py
new file mode 100644
index 0000000..d6618da
--- /dev/null
+++ b/tests/test_structured_output.py
@@ -0,0 +1,319 @@
+"""Tests for the structured output module."""
+
+import pytest
+import json
+import csv
+from pathlib import Path
+from src.output.structured_output import (
+    ClassificationResult,
+    ExtractionResult,
+    PipelineResult,
+    OutputManager,
+    export_to_json,
+    export_to_csv,
+)
+
+
+class TestClassificationResult:
+    #Tests for ClassificationResult dataclass
+
+    def test_create_basic_result(self):
+        result = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.85,
+        )
+        assert result.filename == "test.pdf"
+        assert result.classification == "useful"
+        assert result.confidence == 0.85
+        assert result.model_version == "1.0.0"
+
+    def test_to_dict(self):
+        result = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.85,
+        )
+        d = result.to_dict()
+        assert d["filename"] == "test.pdf"
+        assert d["classification"] == "useful"
+        assert d["confidence"] == 0.85
+        assert "timestamp" in d
+
+    def test_with_error(self):
+        result = ClassificationResult(
+            filename="bad.pdf",
+            classification="unknown",
+            confidence=0.0,
+            error="Failed to extract text",
+        )
+        assert result.error == "Failed to extract text"
+
+
+class TestExtractionResult:
+    #Tests for ExtractionResult dataclass
+
+    def test_create_basic_result(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+            survey_year=2020,
+        )
+        assert result.filename == "test.pdf"
+        assert result.predator_species == "Canis lupus"
+        assert result.survey_year == 2020
+
+    def test_fraction_feeding_auto_calculation(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            non_empty_stomachs=75,
+        )
+        assert result.fraction_feeding == 0.75
+
+    def test_fraction_feeding_not_calculated_when_missing_data(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            # non_empty_stomachs not provided
+        )
+        assert result.fraction_feeding is None
+
+    def test_fraction_feeding_not_overwritten(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            total_stomachs_examined=100,
+            non_empty_stomachs=75,
+            fraction_feeding=0.80,  # Manually set
+        )
+        assert result.fraction_feeding == 0.80  # Should not be overwritten
+
+    def test_to_dict(self):
+        result = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+            survey_location="Yellowstone",
+            survey_year=2020,
+            total_stomachs_examined=50,
+            empty_stomachs=10,
+            non_empty_stomachs=40,
+        )
+        d = result.to_dict()
+        assert d["predator_species"] == "Canis lupus"
+        assert d["survey_location"] == "Yellowstone"
+        assert d["fraction_feeding"] == 0.80
+
+
+class TestPipelineResult:
+    #Tests for PipelineResult dataclass
+
+    def test_create_useful_result(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.9,
+        )
+        extraction = ExtractionResult(
+            filename="test.pdf",
+            predator_species="Canis lupus",
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+            extraction=extraction,
+        )
+        assert pipeline.extraction is not None
+
+    def test_create_not_useful_result(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="not-useful",
+            confidence=0.85,
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+            extraction=None,
+        )
+        assert pipeline.extraction is None
+
+    def test_to_dict(self):
+        classification = ClassificationResult(
+            filename="test.pdf",
+            classification="useful",
+            confidence=0.9,
+        )
+        pipeline = PipelineResult(
+            filename="test.pdf",
+            classification=classification,
+        )
+        d = pipeline.to_dict()
+        assert "classification" in d
+        assert d["classification"]["confidence"] == 0.9
+
+
+class TestOutputManager:
+    #Tests for OutputManager class
+
+    @pytest.fixture
+    def output_dir(self, tmp_path):
+        return tmp_path / "results"
+
+    @pytest.fixture
+    def manager(self, output_dir):
+        return OutputManager(output_dir=str(output_dir))
+
+    @pytest.fixture
+    def sample_classifications(self):
+        return [
+            ClassificationResult(filename="a.pdf", classification="useful", confidence=0.9),
+            ClassificationResult(filename="b.pdf", classification="not-useful", confidence=0.8),
+            ClassificationResult(filename="c.pdf", classification="useful", confidence=0.95),
+        ]
+
+    @pytest.fixture
+    def sample_extractions(self):
+        return [
+            ExtractionResult(
+                filename="a.pdf",
+                predator_species="Species A",
+                total_stomachs_examined=100,
+                non_empty_stomachs=80,
+            ),
+            ExtractionResult(
+                filename="c.pdf",
+                predator_species="Species C",
+                total_stomachs_examined=50,
+                non_empty_stomachs=40,
+            ),
+        ]
+
+    def test_creates_output_directory(self, output_dir):
+        OutputManager(output_dir=str(output_dir))
+        assert output_dir.exists()
+
+    def test_add_classification(self, manager, sample_classifications):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        assert len(manager.classifications) == 3
+
+    def test_add_extraction(self, manager, sample_extractions):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+        assert len(manager.extractions) == 2
+
+    def test_export_classifications_json(self, manager, sample_classifications, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+
+        path = manager.export_classifications_json()
+        assert path.exists()
+
+        with open(path) as f:
+            data = json.load(f)
+
+        assert "metadata" in data
+        assert data["metadata"]["total_files"] == 3
+        assert data["metadata"]["useful_count"] == 2
+        assert len(data["results"]) == 3
+
+    def test_export_classifications_csv(self, manager, sample_classifications, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+
+        path = manager.export_classifications_csv()
+        assert path.exists()
+
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 3
+        assert rows[0]["filename"] == "a.pdf"
+
+    def test_export_extractions_json(self, manager, sample_extractions, output_dir):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        path = manager.export_extractions_json()
+        assert path.exists()
+
+        with open(path) as f:
+            data = json.load(f)
+
+        assert len(data["results"]) == 2
+        assert data["results"][0]["predator_species"] == "Species A"
+
+    def test_export_extractions_csv(self, manager, sample_extractions, output_dir):
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        path = manager.export_extractions_csv()
+        assert path.exists()
+
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 2
+
+    def test_export_all(self, manager, sample_classifications, sample_extractions, output_dir):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        paths = manager.export_all()
+
+        assert "classifications_json" in paths
+        assert "classifications_csv" in paths
+        assert "extractions_json" in paths
+        assert "extractions_csv" in paths
+
+        for path in paths.values():
+            assert path.exists()
+
+    def test_get_summary(self, manager, sample_classifications, sample_extractions):
+        for c in sample_classifications:
+            manager.add_classification(c)
+        for e in sample_extractions:
+            manager.add_extraction(e)
+
+        summary = manager.get_summary()
+
+        assert summary["total_classifications"] == 3
+        assert summary["useful_count"] == 2
+        assert summary["not_useful_count"] == 1
+        assert summary["total_extractions"] == 2
+
+
+class TestConvenienceFunctions:
+    #Tests for standalone export functions.
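+    # These exercise export_to_json/export_to_csv directly, without an
+    # OutputManager, to cover the standalone round-trip paths.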
+
+    def test_export_to_json(self, tmp_path):
+        results = [{"name": "test", "value": 123}]
+        path = export_to_json(results, str(tmp_path / "test.json"))
+
+        assert path.exists()
+        with open(path) as f:
+            data = json.load(f)
+        assert data == results
+
+    def test_export_to_csv(self, tmp_path):
+        results = [
+            {"name": "a", "value": 1},
+            {"name": "b", "value": 2},
+        ]
+        path = export_to_csv(results, str(tmp_path / "test.csv"))
+
+        assert path.exists()
+        with open(path, newline="") as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 2
+        assert rows[0]["name"] == "a"
+
+    def test_export_empty_csv(self, tmp_path):
+        path = export_to_csv([], str(tmp_path / "empty.csv"))
+        assert path.exists()

From 9e27236d12826f95c2cc6eeb80cd5df9e62277e1 Mon Sep 17 00:00:00 2001
From: sillygoose
Date: Tue, 2 Dec 2025 17:00:08 -0800
Subject: [PATCH 2/2] Fix code formatting

---
 src/model/pdf_classifier.py     | 27 +++++++------
 src/output/structured_output.py | 71 +++++++++++++--------------------
 tests/test_structured_output.py | 10 ++---
 3 files changed, 46 insertions(+), 62 deletions(-)

diff --git a/src/model/pdf_classifier.py b/src/model/pdf_classifier.py
index e4c661a..1202712 100644
--- a/src/model/pdf_classifier.py
+++ b/src/model/pdf_classifier.py
@@ -16,6 +16,7 @@
 # Try to import structured output module
 try:
     from src.output.structured_output import ClassificationResult, OutputManager
+
     STRUCTURED_OUTPUT_AVAILABLE = True
 except ImportError as e:
     print(f"[WARNING] Could not import structured_output: {e}")
@@ -25,8 +26,8 @@
 
 
 def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False):
-    #Classify a single PDF as useful or not useful.
-
+    # Classify a single PDF as useful or not useful.
+
     start_time = time.time()
     model_path = Path(model_dir) / "pdf_classifier.json"
     vectorizer_path = Path(model_dir) / "tfidf_vectorizer.pkl"
@@ -98,36 +99,36 @@ def classify_pdf(pdf_path, model_dir="src/model/models", return_result=False):
 
 
 def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/results"):
-    #Classify all PDFs in a folder and export results.
-
+    # Classify all PDFs in a folder and export results.
+
     if not STRUCTURED_OUTPUT_AVAILABLE:
         print("[ERROR] Structured output module not available.")
         print("Make sure src/output/structured_output.py exists.")
         return {}
-
+
     folder = Path(folder_path)
     if not folder.exists():
         print(f"[ERROR] Folder not found: {folder_path}")
         return {}
-
+
     pdf_files = list(folder.glob("*.pdf"))
     if not pdf_files:
         print(f"[WARN] No PDF files found in {folder_path}")
        return {}
-
+
     print(f"Found {len(pdf_files)} PDF files to classify.")
-
+
     manager = OutputManager(output_dir=output_dir)
-
+
     for i, pdf_path in enumerate(pdf_files, 1):
         print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_path.name}")
         result = classify_pdf(str(pdf_path), model_dir=model_dir, return_result=True)
         if result:
             manager.add_classification(result)
-
+
     # Export results
     paths = manager.export_all()
-
+
     # Print summary
     print("\n=== Classification Summary ===")
     summary = manager.get_summary()
@@ -136,7 +137,7 @@ def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/
     print(f"  Not useful: {summary['not_useful_count']}")
     print(f"  Avg confidence: {summary['average_classification_confidence']:.2%}")
     print("==============================\n")
-
+
     return paths
 
 
@@ -157,4 +158,4 @@ def classify_folder(folder_path, model_dir="src/model/models", output_dir="data/
     elif args.pdf_path:
         classify_pdf(args.pdf_path, args.model_dir)
     else:
-        parser.print_help()
\ No newline at end of file
+        parser.print_help()
diff --git a/src/output/structured_output.py b/src/output/structured_output.py
index dfeca0e..5f63c96 100644
--- a/src/output/structured_output.py
+++ b/src/output/structured_output.py
@@ -1,4 +1,4 @@
-"""Structured Output Module 
+"""Structured Output Module
 
 This module handles the export of classification and data extraction results
 to JSON and CSV formats with clear provenance and uncertainty tracking.
@@ -32,7 +32,7 @@
 
 @dataclass
 class ClassificationResult:
-    #Stores the result of classifying a single PDF.
+    # Stores the result of classifying a single PDF.
 
     filename: str
     classification: str
@@ -44,13 +44,13 @@ class ClassificationResult:
     error: Optional[str] = None
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         return asdict(self)
 
 
 @dataclass
 class ExtractionResult:
-    #Stores extracted data from a 'useful' PDF
+    # Stores extracted data from a 'useful' PDF
     filename: str
     predator_species: Optional[str] = None
     predator_common_name: Optional[str] = None
@@ -72,30 +72,25 @@ class ExtractionResult:
     error: Optional[str] = None
 
     def __post_init__(self):
-        #Calculate fraction_feeding if stomach counts are available
-        if (
-            self.fraction_feeding is None
-            and self.total_stomachs_examined is not None
-            and self.non_empty_stomachs is not None
-            and self.total_stomachs_examined > 0
-        ):
+        # Calculate fraction_feeding if stomach counts are available
+        if self.fraction_feeding is None and self.total_stomachs_examined is not None and self.non_empty_stomachs is not None and self.total_stomachs_examined > 0:
             self.fraction_feeding = self.non_empty_stomachs / self.total_stomachs_examined
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         return asdict(self)
 
 
 @dataclass
 class PipelineResult:
-    #Combined result from the full pipeline (classification and extraction).
+    # Combined result from the full pipeline (classification and extraction).
 
     filename: str
     classification: ClassificationResult
     extraction: Optional[ExtractionResult] = None
 
     def to_dict(self) -> Dict[str, Any]:
-        #Convert to dictionary for JSON serialization
+        # Convert to dictionary for JSON serialization
         result = {
             "filename": self.filename,
             "classification": self.classification.to_dict(),
         }
@@ -106,10 +101,10 @@ def to_dict(self) -> Dict[str, Any]:
 
 
 class OutputManager:
-    #Manages collection and export of pipeline results.
+    # Manages collection and export of pipeline results.
 
     def __init__(self, output_dir: str = "data/results"):
-        #Initialize the OutputManager.
+        # Initialize the OutputManager.
 
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -118,22 +113,22 @@ def __init__(self, output_dir: str = "data/results"):
         self.pipeline_results: List[PipelineResult] = []
 
     def add_classification(self, result: ClassificationResult) -> None:
-        #Add a classification result to the collection
+        # Add a classification result to the collection
         self.classifications.append(result)
 
     def add_extraction(self, result: ExtractionResult) -> None:
-        #Add an extraction result to the collection.
+        # Add an extraction result to the collection.
         self.extractions.append(result)
 
     def add_pipeline_result(self, result: PipelineResult) -> None:
-        #Add a complete pipeline result to the collection
+        # Add a complete pipeline result to the collection
         self.pipeline_results.append(result)
         self.classifications.append(result.classification)
         if result.extraction:
             self.extractions.append(result.extraction)
 
     def export_classifications_json(self, filename: str = "classifications.json") -> Path:
-        #Export classification results to JSON.
+        # Export classification results to JSON.
         output_path = self.output_dir / filename
         data = {
             "metadata": {
@@ -150,7 +145,7 @@ def export_classifications_json(self, filename: str = "classifications.json") ->
         return output_path
 
     def export_classifications_csv(self, filename: str = "classifications.csv") -> Path:
-        #Export classification results to CSV.
+        # Export classification results to CSV.
         output_path = self.output_dir / filename
         if not self.classifications:
             print("[WARN] No classifications to export.")
@@ -166,7 +161,7 @@ def export_classifications_csv(self, filename: str = "classifications.csv") -> P
         return output_path
 
     def export_extractions_json(self, filename: str = "extractions.json") -> Path:
-        #Export extraction results to JSON.
+        # Export extraction results to JSON.
         output_path = self.output_dir / filename
         data = {
             "metadata": {
@@ -182,7 +177,7 @@ def export_extractions_json(self, filename: str = "extractions.json") -> Path:
         return output_path
 
     def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
-        #Export extraction results to CSV.
+        # Export extraction results to CSV.
         output_path = self.output_dir / filename
         if not self.extractions:
             print("[WARN] No extractions to export.")
@@ -198,44 +193,32 @@ def export_extractions_csv(self, filename: str = "extractions.csv") -> Path:
         return output_path
 
     def export_all(self, prefix: str = "") -> Dict[str, Path]:
-        #Export all results to both JSON and CSV formats.
+        # Export all results to both JSON and CSV formats.
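+        # Returns a mapping of logical names (e.g. "classifications_json") to
+        # the Paths that were written, so callers can log exactly what was produced.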
 
         paths = {}
         if self.classifications:
-            paths["classifications_json"] = self.export_classifications_json(
-                f"{prefix}classifications.json" if prefix else "classifications.json"
-            )
-            paths["classifications_csv"] = self.export_classifications_csv(
-                f"{prefix}classifications.csv" if prefix else "classifications.csv"
-            )
+            paths["classifications_json"] = self.export_classifications_json(f"{prefix}classifications.json" if prefix else "classifications.json")
+            paths["classifications_csv"] = self.export_classifications_csv(f"{prefix}classifications.csv" if prefix else "classifications.csv")
         if self.extractions:
-            paths["extractions_json"] = self.export_extractions_json(
-                f"{prefix}extractions.json" if prefix else "extractions.json"
-            )
-            paths["extractions_csv"] = self.export_extractions_csv(
-                f"{prefix}extractions.csv" if prefix else "extractions.csv"
-            )
+            paths["extractions_json"] = self.export_extractions_json(f"{prefix}extractions.json" if prefix else "extractions.json")
+            paths["extractions_csv"] = self.export_extractions_csv(f"{prefix}extractions.csv" if prefix else "extractions.csv")
         return paths
 
     def get_summary(self) -> Dict[str, Any]:
-        #Get a summary of all collected results.
+        # Get a summary of all collected results.
         return {
             "total_classifications": len(self.classifications),
             "useful_count": sum(1 for c in self.classifications if c.classification == "useful"),
             "not_useful_count": sum(1 for c in self.classifications if c.classification == "not-useful"),
             "total_extractions": len(self.extractions),
             "successful_extractions": sum(1 for e in self.extractions if e.error is None),
-            "average_classification_confidence": (
-                sum(c.confidence for c in self.classifications) / len(self.classifications)
-                if self.classifications
-                else 0.0
-            ),
+            "average_classification_confidence": (sum(c.confidence for c in self.classifications) / len(self.classifications) if self.classifications else 0.0),
         }
 
 
 # Convenience functions for simple use cases
 def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
-    #Export a list of result dictionaries to JSON.
+    # Export a list of result dictionaries to JSON.
     path = Path(output_path)
     path.parent.mkdir(parents=True, exist_ok=True)
     with open(path, "w", encoding="utf-8") as f:
@@ -244,7 +227,7 @@ def export_to_json(results: List[Dict[str, Any]], output_path: str) -> Path:
 
 def export_to_csv(results: List[Dict[str, Any]], output_path: str) -> Path:
-    #Export a list of result dictionaries to CSV.
+    # Export a list of result dictionaries to CSV.
     path = Path(output_path)
     path.parent.mkdir(parents=True, exist_ok=True)
     if not results:
diff --git a/tests/test_structured_output.py b/tests/test_structured_output.py
index d6618da..56c9def 100644
--- a/tests/test_structured_output.py
+++ b/tests/test_structured_output.py
@@ -15,7 +15,7 @@
 
 
 class TestClassificationResult:
-    #Tests for ClassificationResult dataclass
+    # Tests for ClassificationResult dataclass
 
     def test_create_basic_result(self):
         result = ClassificationResult(
@@ -51,7 +51,7 @@ def test_with_error(self):
 
 
 class TestExtractionResult:
-    #Tests for ExtractionResult dataclass
+    # Tests for ExtractionResult dataclass
 
     def test_create_basic_result(self):
         result = ExtractionResult(
@@ -105,7 +105,7 @@ def test_to_dict(self):
 
 
 class TestPipelineResult:
-    #Tests for PipelineResult dataclass
+    # Tests for PipelineResult dataclass
 
     def test_create_useful_result(self):
         classification = ClassificationResult(
@@ -153,7 +153,7 @@ def test_to_dict(self):
 
 
 class TestOutputManager:
-    #Tests for OutputManager class
+    # Tests for OutputManager class
 
     @pytest.fixture
     def output_dir(self, tmp_path):
@@ -288,7 +288,7 @@ def test_get_summary(self, manager, sample_classifications, sample_extractions):
 
 
 class TestConvenienceFunctions:
-    #Tests for standalone export functions.
+    # Tests for standalone export functions.
 
     def test_export_to_json(self, tmp_path):
         results = [{"name": "test", "value": 123}]
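
A minimal sketch of consuming the exports downstream (illustrative only; it
assumes the default "data/results" output directory and the JSON layout
produced by export_classifications_json/export_extractions_json above):

    import json
    from pathlib import Path

    results_dir = Path("data/results")

    # Classification counts come straight from the JSON metadata block.
    with open(results_dir / "classifications.json", encoding="utf-8") as f:
        classifications = json.load(f)
    print(classifications["metadata"]["useful_count"], "useful PDFs")

    # fraction_feeding is pre-computed per record (see ExtractionResult.__post_init__).
    with open(results_dir / "extractions.json", encoding="utf-8") as f:
        extractions = json.load(f)["results"]
    feeding = [e["fraction_feeding"] for e in extractions if e["fraction_feeding"] is not None]
    if feeding:
        print(f"Mean fraction feeding: {sum(feeding) / len(feeding):.2f}")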