From 10ef3d3fed3793856a6c60ca271e11009ac02f72 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Thu, 9 Oct 2025 16:03:43 +0530
Subject: [PATCH 1/4] Fixed the path issue that occurs when using dump-id for
 wiki data

---
 src/scribe_data/wikipedia/generate_autosuggestions.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index 622d448cf..ecd6744b0 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -61,10 +61,16 @@ def generate_autosuggestions(language, dump_id, force_download):
     output_path = f"./{language_abbr}wiki.ndjson"
     if dump_id:
         output_path = f"./{language_abbr}wiki-{dump_id}.ndjson"
+
+    # For consistency, give the partitions directory the same base name as output_path.
+    partitions_dir = f"./{language_abbr}wiki_partitions"
+    if dump_id:
+        partitions_dir = f"./{language_abbr}wiki-{dump_id}_partitions"
+
     parse_to_ndjson(
         output_path=output_path,
         input_dir=target_dir,
-        partitions_dir=f"./{language_abbr}wiki_partitions",
+        partitions_dir=partitions_dir,
         article_limit=None,
         delete_parsed_files=True,
         force_download=force_download,
@@ -72,7 +78,8 @@ def generate_autosuggestions(language, dump_id, force_download):
         verbose=True,
     )
 
-    with open(f"./{language_abbr}wiki.ndjson", "r") as fin:
+    # Read from output_path so the correct file is used when dump_id is passed as an argument.
+    with open(output_path, "r") as fin:
         article_texts = [
             json.loads(lang)[1]
             for lang in tqdm(fin, desc="Articles added", unit="articles")

From d1e8ae799319d1c54c34577b5465f835fe04bfaa Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Thu, 9 Oct 2025 16:30:08 +0530
Subject: [PATCH 2/4] revert: keep partitions_dir hardcoded for consistency
 with tests

---
 src/scribe_data/wikipedia/generate_autosuggestions.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index ecd6744b0..bceec4cee 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -62,15 +62,10 @@ def generate_autosuggestions(language, dump_id, force_download):
     if dump_id:
         output_path = f"./{language_abbr}wiki-{dump_id}.ndjson"
 
-    # For consistency, give the partitions directory the same base name as output_path.
-    partitions_dir = f"./{language_abbr}wiki_partitions"
-    if dump_id:
-        partitions_dir = f"./{language_abbr}wiki-{dump_id}_partitions"
-
     parse_to_ndjson(
         output_path=output_path,
         input_dir=target_dir,
-        partitions_dir=partitions_dir,
+        partitions_dir=f"./{language_abbr}wiki_partitions",
         article_limit=None,
         delete_parsed_files=True,
         force_download=force_download,

From a856864eb6b36b5f95c6bd1c02219d0e683cb0f8 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Fri, 10 Oct 2025 15:22:46 +0530
Subject: [PATCH 3/4] fix: Complete the SAX parsing for XML files

---
 src/scribe_data/wikipedia/extract_wiki.py | 65 +++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py
index e43fad6c0..d3e617444 100644
--- a/src/scribe_data/wikipedia/extract_wiki.py
+++ b/src/scribe_data/wikipedia/extract_wiki.py
@@ -463,3 +463,68 @@ def __init__(self):
         self._values = {}
         self._current_tag = None
         self.target_articles = []
+
+    # SAX callbacks that extract the text inside each XML element.
+    # startElement, endElement, and characters collect the text between tags.
+    # The SAX parser streams through the dump and triggers these callbacks as tags open and close.
+
+    def startElement(self, name, attrs):
+        """
+        Handle the start of an XML element.
+
+        Parameters
+        ----------
+        name : str
+            The name of the XML element being opened.
+        attrs : xml.sax.xmlreader.AttributesImpl
+            The attributes associated with the element.
+        """
+        if name in ("title", "text", "timestamp"):
+            self._current_tag = name
+            self._buffer = []
+
+    def endElement(self, name):
+        """
+        Handle the end of an XML element.
+
+        Parameters
+        ----------
+        name : str
+            The name of the XML element being closed.
+        """
+        if name == self._current_tag:
+            self._values[name] = "".join(self._buffer)
+
+        if name == "page":
+            # Process the complete page
+            title = self._values.get("title", "")
+            text = self._values.get("text", "")
+
+            # Filter out redirect pages and special pages
+            if (
+                text
+                and not text.strip().startswith("#REDIRECT")
+                and not text.strip().startswith("#redirect")
+                and ":" not in title
+            ):  # Skip namespace pages
+                processed_title, processed_text = _process_article(title, text)
+
+                if processed_text and len(processed_text) > 100:  # Minimum text length
+                    self.target_articles.append([processed_title, processed_text])
+
+            # Reset values for the next page
+            self._values = {}
+            self._buffer = None
+            self._current_tag = None
+
+    def characters(self, content):
+        """
+        Handle character data within an XML element.
+
+        Parameters
+        ----------
+        content : str
+            The character data content from the XML element.
+        """
+        if self._current_tag:
+            self._buffer.append(content)

From 700d46e2b18b252f705fd7011403ecbe1d4c7246 Mon Sep 17 00:00:00 2001
From: Priyanka Pudi
Date: Wed, 15 Oct 2025 00:04:52 +0530
Subject: [PATCH 4/4] Task: Generate autosuggestions for English

---
 src/scribe_data/wikipedia/extract_wiki.py | 36 +++++++++----------
 .../wikipedia/generate_autosuggestions.py | 12 ++++++-
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py
index d3e617444..ab6badb0f 100644
--- a/src/scribe_data/wikipedia/extract_wiki.py
+++ b/src/scribe_data/wikipedia/extract_wiki.py
@@ -464,10 +464,6 @@ def __init__(self):
         self._current_tag = None
         self.target_articles = []
 
-    # SAX callbacks that extract the text inside each XML element.
-    # startElement, endElement, and characters collect the text between tags.
-    # The SAX parser streams through the dump and triggers these callbacks as tags open and close.
-
     def startElement(self, name, attrs):
         """
         Handle the start of an XML element.
@@ -500,22 +496,22 @@ def endElement(self, name):
             title = self._values.get("title", "")
             text = self._values.get("text", "")
 
-            # Filter out redirect pages and special pages
-            if (
-                text
-                and not text.strip().startswith("#REDIRECT")
-                and not text.strip().startswith("#redirect")
-                and ":" not in title
-            ):  # Skip namespace pages
-                processed_title, processed_text = _process_article(title, text)
-
-                if processed_text and len(processed_text) > 100:  # Minimum text length
-                    self.target_articles.append([processed_title, processed_text])
-
-            # Reset values for the next page
-            self._values = {}
-            self._buffer = None
-            self._current_tag = None
+            # Filter out redirect pages and special pages
+            if (
+                text
+                and not text.strip().startswith("#REDIRECT")
+                and not text.strip().startswith("#redirect")
+                and ":" not in title
+            ):  # Skip namespace pages
+                processed_title, processed_text = _process_article(title, text)
+
+                if processed_text and len(processed_text) > 100:  # Minimum text length
+                    self.target_articles.append([processed_title, processed_text])
+
+            # Reset values for the next page
+            self._values = {}
+            self._buffer = None
+            self._current_tag = None
 
     def characters(self, content):
         """

diff --git a/src/scribe_data/wikipedia/generate_autosuggestions.py b/src/scribe_data/wikipedia/generate_autosuggestions.py
index 5f5b9520c..71b2e626a 100644
--- a/src/scribe_data/wikipedia/generate_autosuggestions.py
+++ b/src/scribe_data/wikipedia/generate_autosuggestions.py
@@ -73,7 +73,7 @@ def generate_autosuggestions(language, dump_id, force_download):
         verbose=True,
     )
 
-    with open(output_path, "r") as fin:
+    with open(output_path, "r", encoding="utf-8") as fin:
         article_texts = [
             json.loads(lang)[1]
             for lang in tqdm(fin, desc="Articles added", unit="articles")
@@ -101,3 +101,13 @@ def generate_autosuggestions(language, dump_id, force_download):
         update_local_data=True,
         verbose=True,
     )
+
+
+# Uncomment to run a local test.
+"""if __name__ == "__main__":
+    generate_autosuggestions(
+        language="english",
+        dump_id="20250520",
+        force_download=False,
+    )
+"""
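
Reviewer note: below is a minimal, self-contained sketch of how the three SAX
callbacks added in PATCH 3/4 cooperate when driven by xml.sax. It is
illustrative only: PageHandler is a hypothetical stand-in for the handler
class in extract_wiki.py (the class name is not visible in these hunks), and
the _process_article call and 100-character minimum from the patch are noted
in comments rather than reproduced.

import xml.sax


class PageHandler(xml.sax.ContentHandler):
    """Collects [title, text] pairs from <page> elements, as in the patch."""

    def __init__(self):
        super().__init__()
        self._buffer = []
        self._values = {}
        self._current_tag = None
        self.target_articles = []

    def startElement(self, name, attrs):
        # Begin buffering when a tag of interest opens.
        if name in ("title", "text", "timestamp"):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        # Store the buffered text once the tracked tag closes.
        if name == self._current_tag:
            self._values[name] = "".join(self._buffer)

        if name == "page":
            title = self._values.get("title", "")
            text = self._values.get("text", "")
            # The real patch additionally runs _process_article() and keeps
            # only articles whose processed text exceeds 100 characters.
            if (
                text
                and not text.strip().startswith("#REDIRECT")
                and not text.strip().startswith("#redirect")
                and ":" not in title
            ):
                self.target_articles.append([title, text])

            # Reset state for the next page.
            self._values = {}
            self._current_tag = None

    def characters(self, content):
        # SAX may deliver one text node in several chunks, hence the buffer.
        if self._current_tag:
            self._buffer.append(content)


handler = PageHandler()
xml.sax.parseString(
    b"<page><title>Example</title><text>Some article text.</text></page>",
    handler,
)
print(handler.target_articles)  # [['Example', 'Some article text.']]

The chunked delivery is why the patch appends fragments in characters() and
joins them once in endElement(): the parser is free to split a single text
node across several characters() calls, and a single join avoids repeated
string concatenation on large articles.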