Draft

Commits (20)
63ad1e4
refactor: remove unused notebook
mali-git Nov 18, 2025
23dfa98
docs: add tutorial for generating embeddings
mali-git Nov 18, 2025
32ea27d
chore: improve figure
mali-git Nov 18, 2025
cbfa05d
refactor: remove unused notebook
mali-git Nov 18, 2025
bbedaee
chore: add notebooks
mali-git Nov 19, 2025
2009cad
feat: Implement paired threshold filtering for JSONL files
ajude2s Jan 20, 2026
7cc62aa
feat: Enhance threshold filter pipeline for paired JSONL processing
ajude2s Jan 20, 2026
2531ee8
feat: Add per-folder threshold overrides for score filtering in thres…
ajude2s Jan 21, 2026
6cf62c5
feat: implement quantile pipeline for JSONL filtering with Slurm support
AbasKhan Jan 21, 2026
f940ee0
feat: Remove deprecated configurations and classes; add new threshold…
ajude2s Jan 21, 2026
a95742f
feat: Remove paths_file parameter from threshold filter pipeline and …
ajude2s Jan 21, 2026
503af94
feat: Remove threshold filter pipeline configuration file
ajude2s Jan 22, 2026
15aa33f
Merge branch 'jsonl_filtering' into quantile_calculation
ajude2s Jan 22, 2026
c2b055c
refactor: move datatrove pipelines into data_pipelines and centralize…
AbasKhan Jan 23, 2026
afaa85d
feat: Add quantile data and tests for quantile pipeline functionality
AbasKhan Jan 24, 2026
6315355
Merge branch 'quantile_cal_refactor' into quantile_calculation
ajude2s Jan 26, 2026
23ad444
fix: Update selection quantile input logic to be consistent and not 1…
AbasKhan Jan 26, 2026
c063e92
refactor: Refactored threshold filter pipeline with configuration and…
ajude2s Jan 26, 2026
e11440d
chore: Renamed config folder for data ablations and added proper sub …
AbasKhan Jan 26, 2026
b16d08d
refactor: renamed the pipeline for calculating quantiles to properly …
AbasKhan Jan 26, 2026
@@ -15,7 +15,7 @@ slurm_settings:
sbatch_args:
account: "p_gptx"
nodes: 1
ntasks: 1
ntasks: 1 # I think it's better to use nodes_per_task directly under slurm_settings and remove this line
gres: gpu:1
partition: "capella"
time: "04:00:00"
96 changes: 96 additions & 0 deletions configs/data_mixes/filtering/threshold_filter_pipeline.yaml
@@ -0,0 +1,96 @@
# Threshold filter pipeline config (builder-style)
# Used by: ml_filter.data_processing.jsonl_filtering.threshold_filter_pipeline.run_threshold_filter_pipeline

running_on_slurm: false

params:
# Input JSONL locations (paired mode)
text_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/ml_filter/src/ml_filter/data/text_jsonl
scores_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/ml_filter/src/ml_filter/data/score_jsonl
domains_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/ml_filter/src/ml_filter/data/domains_jsonl
paths_file: /raid/s3/opengptx/jude/repos/ml_filter/ml_filter/src/ml_filter/data/score_jsonl/dummy_paths.txt
glob_pattern: "**/*.jsonl" # optional
recursive: true
compression: infer # infer | gzip | zstd | null

# Filtering
# - score_keys: keys we read + require to exist on each line
# - thresholds_by_score_key: only these keys are actually used for filtering
score_keys:
- score_Gemma_Snowflake
- score_Llama_Snowflake
thresholds_by_score_key:
score_Gemma_Snowflake: 0.0
score_Llama_Snowflake: 0.0
# Optional per-folder overrides (top-level folder names)
thresholds_by_folder:
Deu_Latn:
score_Gemma_Snowflake: 1.5
score_Llama_Snowflake: 1.5
Fra_Latn:
score_Gemma_Snowflake: 1.6
score_Llama_Snowflake: 1.6
Ita_Latn:
score_Gemma_Snowflake: 1.4
score_Llama_Snowflake: 1.4
Spa_Latn:
score_Gemma_Snowflake: 1.7
score_Llama_Snowflake: 1.7

# Document field names in the JSONL
text_jsonl_id_key: document_id
score_jsonl_id_key: document_id
text_jsonl_text_key: text
domain_jsonl_id_key: document_id
domain_jsonl_domain_key: domain
accepted_domains:
- wikipedia.org
- stackexchange.com

# Paired alignment error handling
on_mismatch: raise # raise | skip_line | skip_file
max_mismatches_per_file: 0

# Optional word-count filter
min_num_words: null
num_words_column: text

# Output
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/ml_filter/outputs/threshold_filter_pipeline_local_dummy
output_filename: "${file_relpath}"

# ------------------------------------------------------------
# Execution settings: choose ONE block depending on running_on_slurm
# ------------------------------------------------------------

local_settings:
tasks: 1
local_tasks: 1
local_rank_offset: 0
workers: -1
logging_dir: null

# slurm_settings:
# tasks: 1
# time: "00:30:00"
# partition: "default"
# cpus_per_task: 4
# mem_per_cpu_gb: 8
# workers: -1
# job_name: "threshold_filter_pipeline"
# qos: "normal"
# env_command: null
# condaenv: null
# venv_path: null
# sbatch_args: null
# max_array_size: 1001
# depends_job_id: null
# job_id_position: -1
# logging_dir: null
# skip_completed: true
# slurm_logs_folder: null
# mail_type: "ALL"
# mail_user: null
# requeue: true
# srun_args: null
# tasks_per_job: 1
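
As an aside, here is a minimal sketch of the acceptance logic this config describes, assuming per-folder thresholds override the global ones and the domain/word-count filters apply on top. Function names are hypothetical, not the pipeline's API; the actual entry point, per the config header, is `ml_filter.data_processing.jsonl_filtering.threshold_filter_pipeline.run_threshold_filter_pipeline`.

```python
# Illustrative sketch only; names below are hypothetical.

def resolve_thresholds(folder: str, defaults: dict, by_folder: dict) -> dict:
    """Per-folder overrides (e.g. Deu_Latn) take precedence over global thresholds."""
    merged = dict(defaults)
    merged.update(by_folder.get(folder, {}))
    return merged


def accept(doc: dict, folder: str, cfg: dict) -> bool:
    """Return True if a document passes all configured filters."""
    thresholds = resolve_thresholds(
        folder, cfg["thresholds_by_score_key"], cfg.get("thresholds_by_folder", {})
    )
    # Every key in thresholds_by_score_key must meet or exceed its threshold.
    if any(doc[key] < thr for key, thr in thresholds.items()):
        return False
    # Optional domain whitelist (accepted_domains).
    accepted = cfg.get("accepted_domains")
    if accepted and doc.get("domain") not in accepted:
        return False
    # Optional word-count floor (min_num_words over num_words_column).
    min_words = cfg.get("min_num_words")
    if min_words is not None and len(doc["text"].split()) < min_words:
        return False
    return True
```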
@@ -0,0 +1,58 @@
running_on_slurm: false

params:
text_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/soofi_filtering/filters/data/text_jsonl
scores_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/soofi_filtering/filters/data/score_jsonl
domains_input_dir: /raid/s3/opengptx/jude/repos/ml_filter/soofi_filtering/filters/data/domain_jsonl
glob_pattern: null
recursive: true
compression: null

# --- Filtering ---
score_keys:
- score_Gemma_Snowflake
- score_Llama_Snowflake
thresholds_by_score_key:
score_Gemma_Snowflake: 0.0
score_Llama_Snowflake: 0.0
thresholds_by_folder:
Deu_Latn:
score_Gemma_Snowflake: 1.5
score_Llama_Snowflake: 1.5
Fra_Latn:
score_Gemma_Snowflake: 1.6
score_Llama_Snowflake: 1.6
Ita_Latn:
score_Gemma_Snowflake: 1.4
score_Llama_Snowflake: 1.4
Spa_Latn:
score_Gemma_Snowflake: 1.7
score_Llama_Snowflake: 1.7

text_jsonl_id_key: id
score_jsonl_id_key: id
text_jsonl_text_key: text
domain_jsonl_id_key: id
domain_jsonl_domain_key: domain
accepted_domains:
- wikipedia.org
- stackexchange.com

# --- Optional: word-count filter ---
min_num_words: null
num_words_column: text

# --- Paired alignment error handling ---
on_mismatch: raise
max_mismatches_per_file: 0

# --- Output ---
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/soofi_filtering/filters/output/threshold_filter_pipeline_local_dummy
output_filename: "${file_relpath}"

local_settings:
tasks: 1
local_tasks: 1
local_rank_offset: 0
workers: -1
logging_dir: null
@@ -0,0 +1,20 @@
params:
input_dir: /raid/s3/opengptx/akhan/ml_filter/data/dummy_quantile_data
glob_pattern: "**/*.jsonl"
output_dir: /raid/s3/opengptx/akhan/ml_filter/outputs
compression: null
output_compression: null
score_fields: ["score_llama", "score_mistral", "score_gemma"]
selection_quantile: 0.80
report_filename: quantile_report.yaml
quantile_data_dir: quantile_data

running_on_slurm: false

local_settings:
tasks: 1 # world_size (number of ranks / shards)
local_tasks: 1
local_rank_offset: 0
workers: 1

slurm_settings: null
@@ -0,0 +1,30 @@
params:
input_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/output/annotations/annotated_data
glob_pattern: "**/*.jsonl"
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/output/quantiles
compression: null
output_compression: null
score_fields: ["score_llama", "score_mistral", "score_gemma"]
selection_quantile: 0.9
report_filename: quantile_report.yaml
quantile_data_dir: quantile_data

running_on_slurm: true

local_settings: null

slurm_settings:
sbatch_args:
account: "p_gptx"
nodes: 1
ntasks: 1 # I think it's better to use nodes_per_task directly under slurm_settings and remove this line
gres: gpu:1
partition: "capella"
time: "00:30:00"
cpus_per_task: 4
mem_per_cpu_gb: 8
job_name: "quantile_pipeline"
qos: "normal"
venv_path: /data/cat/ws/alju972f-regression_heads/repos/env/jql_pipeline/bin/activate
tasks: 1
workers: 1
18 changes: 0 additions & 18 deletions configs/data_processing/xlm_roberta_tokenize.yaml

This file was deleted.

49 changes: 45 additions & 4 deletions documentation/pipelines.md
@@ -1,4 +1,4 @@
# Embedding & Annotation Pipelines
# Embedding, Annotation, and Ablation Pipelines

This document explains how to generate model embeddings for large JSONL corpora and then run regression / classification heads to obtain annotation scores at scale using MLFilter's Datatrove-based pipelines.

@@ -37,7 +37,7 @@ Notes:

## Overview

The workflow consists of two sequential pipelines:
The workflow consists of two sequential pipelines plus optional ablation runs:

1. Embedding Pipeline (`run_embedding_pipeline`)
Reads raw JSONL documents, tokenizes & feeds them through an embedding model, and stores embeddings (optionally with labels) into per-source HDF5 files.
@@ -198,6 +198,47 @@ Per embedding source file: `${source_filename}.jsonl` written to:
```
Each line contains original metadata (from `output_keys`) plus head outputs (scores / predictions).
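
For illustration only, an annotated output line might look like the following; the values and exact key set are hypothetical, with score keys mirroring the configs in this PR:

```json
{"document_id": "doc-000123", "text": "…", "score_Gemma_Snowflake": 1.62, "score_Llama_Snowflake": 1.48}
```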

---
## Quantile Ablation Pipeline

Computes per-language score quantiles over JSONL files, averaging the configured score fields per document, and emits a YAML report.

### YAML Schema (`QuantilePipelineParameters`)

| Field | Type | Description |
|-------|------|-------------|
| `input_dir` | str | Directory containing JSONL files. |
| `glob_pattern` | str | Glob selecting which JSONL files to process (e.g. `*.jsonl`). |
| `output_dir` | path | Base output directory. |
| `compression` | str/None | Compression for input JSONL files (`infer`, `gzip`, `zstd`, `None`). |
| `score_fields` | list[str] | Score fields to average per document (e.g. `["score_llama", "score_mistral"]`). |
| `selection_quantile` | float | Top fraction to keep (e.g. `0.2` keeps top 20%). |
| `report_filename` | str | Filename for the YAML report (default `quantile_report.yaml`). |
| `output_compression` | str/None | Compression for the written outputs (`gzip`, `zstd`, `None`). |
| `quantile_data_dir` | str | Subdirectory under `output_dir` for intermediate quantile data. |

Execution mode fields mirror other pipelines: `running_on_slurm`, `local_settings` or `slurm_settings`.

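To make the averaging and selection step concrete, here is a minimal local sketch, not the pipeline's actual code, of how the per-document means and the resulting threshold could be computed; the file path is a placeholder:

```python
import json
import numpy as np

# Standalone sketch; the actual pipeline shards this work across Datatrove ranks.
score_fields = ["score_llama", "score_mistral", "score_gemma"]
selection_quantile = 0.2  # keep the top 20% by mean score

means = []
with open("data/jsonl/example.jsonl") as f:
    for line in f:
        doc = json.loads(line)
        # Average the configured score fields per document.
        means.append(np.mean([doc[k] for k in score_fields]))

# Documents at or above this threshold fall in the selected top fraction.
threshold = float(np.quantile(means, 1.0 - selection_quantile))
print(f"keep documents with mean score >= {threshold:.4f}")
```
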
### Minimal Local Example
```yaml
running_on_slurm: false
params:
input_dir: data/jsonl
glob_pattern: "*.jsonl"
output_dir: outputs
compression: infer
output_compression: gzip
score_fields: ["score_llama", "score_mistral", "score_gemma"]
selection_quantile: 0.2
local_settings: {}
```

### Running
```bash
ml_filter run_quantile_pipeline --config_file_path configs/quantile_job.yaml
```

### Outputs
- YAML report at `<output_dir>/<report_filename>` (or per-rank when running on Slurm).

---
## Chaining the Pipelines

@@ -226,8 +267,8 @@ Each line contains original metadata (from `output_keys`) plus head outputs (sco
## Programmatic Usage Sketch
```python
from pathlib import Path
from ml_filter.annotation.embedding_pipeline import run_embedding_pipeline
from ml_filter.annotation.annotation_pipeline import run_annotation_pipeline
from ml_filter.data_pipelines.annotation.embedding_pipeline import run_embedding_pipeline
from ml_filter.data_pipelines.annotation.annotation_pipeline import run_annotation_pipeline

run_embedding_pipeline(Path("configs/embedding_job.yaml"))
run_annotation_pipeline(Path("configs/annotation_job.yaml"))
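
# Hypothetical extension of the sketch: the threshold filter pipeline appears
# to use the same config-file convention, with the entry point taken from its
# config header above; the config path below is a placeholder.
from ml_filter.data_processing.jsonl_filtering.threshold_filter_pipeline import (
    run_threshold_filter_pipeline,
)

run_threshold_filter_pipeline(Path("configs/threshold_filter_job.yaml"))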
```