From 10ef3d3fed3793856a6c60ca271e11009ac02f72 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Thu, 9 Oct 2025 16:03:43 +0530
Subject: [PATCH 1/4] Fixed the path issue that occurs when using dump-id for
 wiki data

---
 src/scribe_data/wikipedia/generate_autosuggestions.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index 622d448cf..ecd6744b0 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -61,10 +61,16 @@ def generate_autosuggestions(language, dump_id, force_download):
     output_path = f"./{language_abbr}wiki.ndjson"
     if dump_id:
         output_path = f"./{language_abbr}wiki-{dump_id}.ndjson"
+
+    # For consistency, give the partitions directory the same base name as output_path.
+    partitions_dir = f"./{language_abbr}wiki_partitions"
+    if dump_id:
+        partitions_dir = f"./{language_abbr}wiki-{dump_id}_partitions"
+
     parse_to_ndjson(
         output_path=output_path,
         input_dir=target_dir,
-        partitions_dir=f"./{language_abbr}wiki_partitions",
+        partitions_dir=partitions_dir,
         article_limit=None,
         delete_parsed_files=True,
         force_download=force_download,
@@ -72,7 +78,8 @@ def generate_autosuggestions(language, dump_id, force_download):
         verbose=True,
     )
 
-    with open(f"./{language_abbr}wiki.ndjson", "r") as fin:
+    # Read from output_path so the correct file is used when dump_id is passed as an argument.
+    with open(output_path, "r") as fin:
         article_texts = [
             json.loads(lang)[1]
             for lang in tqdm(fin, desc="Articles added", unit="articles")

From d1e8ae799319d1c54c34577b5465f835fe04bfaa Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Thu, 9 Oct 2025 16:30:08 +0530
Subject: [PATCH 2/4] revert: keep partitions_dir hardcoded for consistency
 with tests

---
 src/scribe_data/wikipedia/generate_autosuggestions.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index ecd6744b0..bceec4cee 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -62,15 +62,10 @@ def generate_autosuggestions(language, dump_id, force_download):
     if dump_id:
         output_path = f"./{language_abbr}wiki-{dump_id}.ndjson"
 
-    # For consistency, give the partitions directory the same base name as output_path.
-    partitions_dir = f"./{language_abbr}wiki_partitions"
-    if dump_id:
-        partitions_dir = f"./{language_abbr}wiki-{dump_id}_partitions"
-
     parse_to_ndjson(
         output_path=output_path,
         input_dir=target_dir,
-        partitions_dir=partitions_dir,
+        partitions_dir=f"./{language_abbr}wiki_partitions",
         article_limit=None,
         delete_parsed_files=True,
         force_download=force_download,

From a856864eb6b36b5f95c6bd1c02219d0e683cb0f8 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Fri, 10 Oct 2025 15:22:46 +0530
Subject: [PATCH 3/4] fix: Complete the SAX parsing for XML files

---
 src/scribe_data/wikipedia/extract_wiki.py | 65 +++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py
index e43fad6c0..d3e617444 100644
--- a/src/scribe_data/wikipedia/extract_wiki.py
+++ b/src/scribe_data/wikipedia/extract_wiki.py
@@ -463,3 +463,68 @@ def __init__(self):
         self._values = {}
         self._current_tag = None
         self.target_articles = []
+
+    # SAX callbacks that extract the text inside each XML element.
+    # startElement, endElement, and characters collect the text between tags.
+    # The SAX parser streams through the dump and triggers these callbacks as tags open and close.
+
+    def startElement(self, name, attrs):
+        """
+        Handle the start of an XML element.
+
+        Parameters
+        ----------
+        name : str
+            The name of the XML element being opened.
+        attrs : xml.sax.xmlreader.AttributesImpl
+            The attributes associated with the element.
+        """
+        if name in ("title", "text", "timestamp"):
+            self._current_tag = name
+            self._buffer = []
+
+    def endElement(self, name):
+        """
+        Handle the end of an XML element.
+
+        Parameters
+        ----------
+        name : str
+            The name of the XML element being closed.
+        """
+        if name == self._current_tag:
+            self._values[name] = "".join(self._buffer)
+
+        if name == "page":
+            # Process the complete page
+            title = self._values.get("title", "")
+            text = self._values.get("text", "")
+
+            # Filter out redirect pages and special pages
+            if (
+                text
+                and not text.strip().startswith("#REDIRECT")
+                and not text.strip().startswith("#redirect")
+                and ":" not in title
+            ):  # Skip namespace pages
+                processed_title, processed_text = _process_article(title, text)
+
+                if processed_text and len(processed_text) > 100:  # Minimum text length
+                    self.target_articles.append([processed_title, processed_text])
+
+            # Reset values for the next page
+            self._values = {}
+            self._buffer = None
+            self._current_tag = None
+
+    def characters(self, content):
+        """
+        Handle character data within an XML element.
+
+        Parameters
+        ----------
+        content : str
+            The character data content from the XML element.
+        """
+        if self._current_tag:
+            self._buffer.append(content)

From 700d46e2b18b252f705fd7011403ecbe1d4c7246 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Wed, 15 Oct 2025 00:04:52 +0530
Subject: [PATCH 4/4] Task: Generate autosuggestions for English

---
 src/scribe_data/wikipedia/extract_wiki.py | 36 +++++++++----------
 .../wikipedia/generate_autosuggestions.py | 12 ++++++-
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py
index d3e617444..ab6badb0f 100644
--- a/src/scribe_data/wikipedia/extract_wiki.py
+++ b/src/scribe_data/wikipedia/extract_wiki.py
@@ -464,10 +464,6 @@ def __init__(self):
         self._current_tag = None
         self.target_articles = []
 
-    # SAX callbacks that extract the text inside each XML element.
-    # startElement, endElement, and characters collect the text between tags.
-    # The SAX parser streams through the dump and triggers these callbacks as tags open and close.
-
     def startElement(self, name, attrs):
         """
         Handle the start of an XML element.
@@ -500,22 +496,22 @@ def endElement(self, name):
             title = self._values.get("title", "")
             text = self._values.get("text", "")
 
-            # Filter out redirect pages and special pages
-            if (
-                text
-                and not text.strip().startswith("#REDIRECT")
-                and not text.strip().startswith("#redirect")
-                and ":" not in title
-            ):  # Skip namespace pages
-                processed_title, processed_text = _process_article(title, text)
-
-                if processed_text and len(processed_text) > 100:  # Minimum text length
-                    self.target_articles.append([processed_title, processed_text])
-
-            # Reset values for the next page
-            self._values = {}
-            self._buffer = None
-            self._current_tag = None
+            # Filter out redirect pages and special pages
+            if (
+                text
+                and not text.strip().startswith("#REDIRECT")
+                and not text.strip().startswith("#redirect")
+                and ":" not in title
+            ):  # Skip namespace pages
+                processed_title, processed_text = _process_article(title, text)
+
+                if processed_text and len(processed_text) > 100:  # Minimum text length
+                    self.target_articles.append([processed_title, processed_text])
+
+            # Reset values for the next page
+            self._values = {}
+            self._buffer = None
+            self._current_tag = None
 
     def characters(self, content):
         """

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index 5f5b9520c..71b2e626a 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -73,7 +73,7 @@ def generate_autosuggestions(language, dump_id, force_download):
         verbose=True,
     )
 
-    with open(output_path, "r") as fin:
+    with open(output_path, "r", encoding="utf-8") as fin:
         article_texts = [
             json.loads(lang)[1]
             for lang in tqdm(fin, desc="Articles added", unit="articles")
@@ -101,3 +101,13 @@ def generate_autosuggestions(language, dump_id, force_download):
         update_local_data=True,
         verbose=True,
     )
+
+
+# Uncomment to run a local test.
+"""if __name__ == "__main__":
+    generate_autosuggestions(
+        language="english",
+        dump_id="20250520",
+        force_download=False,
+    )
+"""
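
Reviewer note: below is a minimal, self-contained sketch of how the three SAX
callbacks added in PATCH 3/4 cooperate when driven by xml.sax. It is
illustrative only: PageHandler is a hypothetical stand-in for the handler
class in extract_wiki.py (the class name is not visible in these hunks), and
the _process_article call and 100-character minimum from the patch are noted
in comments rather than reproduced.

import xml.sax


class PageHandler(xml.sax.ContentHandler):
    """Collects [title, text] pairs from <page> elements, as in the patch."""

    def __init__(self):
        super().__init__()
        self._buffer = []
        self._values = {}
        self._current_tag = None
        self.target_articles = []

    def startElement(self, name, attrs):
        # Begin buffering when a tag of interest opens.
        if name in ("title", "text", "timestamp"):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        # Store the buffered text once the tracked tag closes.
        if name == self._current_tag:
            self._values[name] = "".join(self._buffer)

        if name == "page":
            title = self._values.get("title", "")
            text = self._values.get("text", "")
            # The real patch additionally runs _process_article() and keeps
            # only articles whose processed text exceeds 100 characters.
            if (
                text
                and not text.strip().startswith("#REDIRECT")
                and not text.strip().startswith("#redirect")
                and ":" not in title
            ):
                self.target_articles.append([title, text])

            # Reset state for the next page.
            self._values = {}
            self._current_tag = None

    def characters(self, content):
        # SAX may deliver one text node in several chunks, hence the buffer.
        if self._current_tag:
            self._buffer.append(content)


handler = PageHandler()
xml.sax.parseString(
    b"<page><title>Example</title><text>Some article text.</text></page>",
    handler,
)
print(handler.target_articles)  # [['Example', 'Some article text.']]

The chunked delivery is why the patch appends fragments in characters() and
joins them once in endElement(): the parser is free to split a single text
node across several characters() calls, and a single join avoids repeated
string concatenation on large articles.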