From e3be339ef113550158f3a62ff0a230290c89183f Mon Sep 17 00:00:00 2001 From: ftosoni Date: Mon, 7 Jul 2025 05:31:31 +0200 Subject: [PATCH 01/10] implement tvar & cleanup --- app.py | 1173 +++++++++++++++++++++++++++----------------------------- 1 file changed, 572 insertions(+), 601 deletions(-) diff --git a/app.py b/app.py index c22a4cc..e0c1d20 100644 --- a/app.py +++ b/app.py @@ -4,657 +4,628 @@ app = Flask(__name__) CORS(app) # Enable CORS for all routes -# Alternatively, to restrict origins: -# CORS(app, resources={r"/api/*": {"origins": "https://meta.wikimedia.org"}}) - -# Regex to check if a line already has tags -# Updated regex to detect any presence of tags (including comments and spaces) -translate_tag_pattern = re.compile(r"]*>.*?", re.DOTALL) -# Regex to match attributes like rowspan, colspan, etc. -attribute_pattern = re.compile(r"\b\w+(?!==)=([^\s|]+)") -# Regex to detect table cell separators (| and ||) -table_cell_separator_pattern = re.compile(r"(\|\||\||\*)") -# Regex to detect headers in the format of == Header == -header_pattern = re.compile(r"^(=+)(.*?)(=+)$") -# Regex to detect table header cell separators (! and !!) -header_cell_separator_pattern = re.compile(r"(!{1,2})") -# Regex to detect HTML entities (special characters) -special_char_pattern = re.compile(r"&\w+;") -# Regex for hiero, sub, sup, and math tags -# Matches text wrapped in ... -hiero_pattern = re.compile(r'(.*?)') -# Matches text wrapped in ... tags or Unicode subscript characters (e.g., ₀). -sub_pattern = re.compile(r'(.*?|̀[0-9];)') -# Matches text wrapped in ... tags or Unicode superscript characters (e.g., , ¹). -sup_pattern = re.compile(r'(.*?|̾[0-9];|&sup[0-9];)') -# Matches text wrapped in ... tags. -math_tag_pattern = re.compile(r'(.*?)') -# Matches {{math|...}} templates. -math_template_pattern = re.compile(r'(\{\{math\|.*?\}\})') -# Matches time strings in formats like "12:34", "3:45PM", or "11:00am". -time_pattern = re.compile(r'\b\d{1,2}:\d{2}(AM|PM|am|pm)?\b') -# Matches and tags. -gallery_pattern = re.compile(r'|') -# Matches occurrences of "File:". -file_pattern = re.compile(r'File:') -# Matches
<br> tags. -br_pattern = re.compile(r'<br>
') -# Matches magic words wrapped in double underscores (e.g., __NOTOC__). -magic_word = re.compile(r'__(.*?)__') -# Matches occurrences of the word "alt". -alt_pattern = re.compile(r'alt') -# Matches text inside double square brackets (e.g., [[example]]). -square_bracket_text_pattern = re.compile(r'\[\[(.*?)\]\]') -# Matches links with a pipe separator in double square brackets (e.g., [[link|display text]]). -square_bracket_with_pipeline_pattern = re.compile(r'\[\[([^\|\]]+)\|([^\]]+)\]\]') -# Matches occurrences of the '#' -existing_translation_pattern = re.compile(r'#') - - - -def add_translate_tags(text): - """ - Wraps the entire text in tags if it doesn't already have them, - ensuring that special characters (e.g., ì), time values (e.g., 9:30AM), - and certain tags (e.g., hiero, sub, sup, math) are not wrapped in tags. - Skips adding tags if they are already present, even with comments or special content. - """ - if not text.strip(): - return text - if re.search(r'.*', text): - return text +# --- Helper Functions for Processing Different Wikitext Elements --- +# These functions are designed to handle specific wikitext structures. +# Some will recursively call the main `convert_to_translatable_wikitext` +# function to process their internal content, ensuring nested elements +# are also handled correctly. - # If the text already has tags (including comments), do not add them - if translate_tag_pattern.search(text): +def _wrap_in_translate(text): + """ + Wraps the given text with tags. + It ensures that empty or whitespace-only strings are not wrapped. + """ + if not text or not text.strip(): return text + return f"{text}" - # If the text has any special characters, time values, or certain tags, don't wrap it in tags - if (attribute_pattern.search(text) or special_char_pattern.match(text) or - hiero_pattern.search(text) or sub_pattern.search(text) or sup_pattern.search(text) or - time_pattern.match(text) or gallery_pattern.search(text) or file_pattern.search(text) or br_pattern.search(text) or magic_word.search(text)) : # Skip time values +def process_syntax_highlight(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid syntax highlight tag" + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: return text - # Wrap the entire block of text in tags - return f'{text}' + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_math(line): +def process_table(text): """ - Processes math-related tags ({{math}}, , etc.) and ensures their content is not wrapped in tags. + Processes table blocks in the wikitext. + It wraps the content in tags. """ - # Generalized regex for math-related tags - math_patterns = [ - re.compile(r'(\{\{math\|.*?\}\})', re.DOTALL), # For {{math}} templates - re.compile(r'(.*?)', re.DOTALL) # For tags with attributes and content - ] + assert(text.startswith('{|') and text.endswith('|}')), "Invalid table tag" + return text - for pattern in math_patterns: - match = pattern.search(line) - if match: - math_content = match.group(0) - return line.replace(math_content, math_content) # Return math-related content as is +def process_blockquote(text): + """ + Processes blockquote tags in the wikitext. 
+ It wraps the content in <translate> tags. + """ + assert(text.startswith('<blockquote
') and text.endswith('</blockquote>
')), "Invalid blockquote tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - return line +def process_poem_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid poem tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_table_line(line): +def process_code_tag(text): """ - Processes a single line of a table and adds tags where necessary, - ensuring that only the actual content of table cells is wrapped, not the separators. + Processes tags in the wikitext. + It wraps the content in tags. """ - if not line: - return line + assert(text.startswith('')), "Invalid code tag" + return text - if line.startswith("|+"): - # For table caption - return f'{line[:2]}{add_translate_tags(line[2:].strip()) if len(line) > 2 else ""}' - elif line.startswith("|-"): - # Table row separator - return line - elif line.startswith("!"): - # For table headers, split on ! and !! without breaking words - headers = header_cell_separator_pattern.split(line) - translated_headers = [] - for header in headers: - if header in ['!', '!!']: # Preserve the ! and !! without adding translate tags - translated_headers.append(header) - else: - # Safely process header content - processed_header = header.strip() - if processed_header: - processed_header = process_external_link(processed_header) - processed_header = process_double_name_space(processed_header) - translated_headers.append(add_translate_tags(processed_header)) - return "".join(translated_headers) - else: - # For table rows, ensure content is wrapped but separators are untouched - cells = table_cell_separator_pattern.split(line) - translated_cells = [] - for cell in cells: - if cell in ['|', '||', '*']: # Leave separators as is - translated_cells.append(cell) - elif cell and cell.startswith("[["): - # Process wiki links using process_double_name_space - processed_cell = process_double_name_space(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("http"): - # Process external links - processed_cell = process_external_link(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("{{"): - # Process double curly braces - processed_cell = process_doublecurly(cell) - translated_cells.append(processed_cell) - elif cell: - translated_cells.append(add_translate_tags(cell.strip())) - return "".join(translated_cells) - -def process_div(line): - """ - Processes any
<div> tag and adds <translate> tags around the text content inside the div, - while keeping the div structure and attributes intact. - """ - # Regex pattern to detect <div>
tags - div_pattern = re.compile(r'(<div[^>]*>)(.*?)(</div>
)', re.DOTALL) - match = div_pattern.search(line) +def process_div(text): + """ + Processes <div>
tags in the wikitext. + It wraps the content in <translate> tags. + """ + assert(text.startswith('<div')), "Invalid div tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in <translate> tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - if match: - opening_div_tag = match.group(1) # <div ...>
- div_content = match.group(2) # Text or content inside the div - closing_div_tag = match.group(3) # </div>
+def process_hiero(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid hiero tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - # Wrap only the text content inside the div with tags - translated_content = add_translate_tags(div_content.strip()) +def process_sub_sup(text): + """ + Processes and tags in the wikitext. + It wraps the content in tags. + """ + assert((text.startswith('') and text.endswith('')) or + (text.startswith('') and text.endswith(''))), "Invalid sub/sup tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - return f'{opening_div_tag}{translated_content}{closing_div_tag}' - return line +def process_math(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid math tag" + return text -def process_header(line): - match = header_pattern.match(line) - if match: - translated_header_text = add_translate_tags(match.group()) - return translated_header_text - return line +def process_small_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid small tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" +def process_nowiki(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid nowiki tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_double_name_space(line): +def process_item(text): """ - Double Name space (e.g., [[link/eg]]) and adds tags around the eg text. - Also handles simple internal links by adding Special:MyLanguage prefix. - Properly handles text that appears after closing brackets by adding translate tags. - Does not put translate tags around colons. + Processes list items in the wikitext. + It wraps the content in tags. 
""" - if 'Special:MyLanguage/' in line: + offset = 0 + if text.startswith(';'): + offset = 1 + elif text.startswith(':'): + offset = 1 + elif text.startswith('#'): + while text[offset] == '#': + offset += 1 + elif text.startswith('*'): + while text[offset] == '*': + offset += 1 + # Add translate tags around the item content + item_content = text[offset:].strip() + if not item_content: + return text + return f"{text[:offset]} {item_content}\n" + +def _process_file(s) : + # Define keywords that should NOT be translated when found as parameters + NON_TRANSLATABLE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', + 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom' + } + NON_TRANSLATABLE_KEYWORDS_PREFIXES = { + 'link=', 'upright=' + } + + output_parts = [] + + # The first token shall start with a file alias + # e.g., "File:Example.jpg" or "Image:Example.png" + if not tokens or not tokens[0].startswith(tuple(file_aliases)): return line - if line.lower().startswith("[[category:"): - y = line.split(']]')[0] - if not existing_translation_pattern.search(line): - return y + '{{#translation:}}]]' + # Extract the file name + filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] + + # The first token is the file name (e.g., "File:Example.jpg") + # We substitute any occurrences of "Image:" with "File:" + output_parts.append(f'File:{filename}') + + pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" + for token in tokens[1:]: + # Check for 'alt=' + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + output_parts.append(f'alt={alt_text}') + # Check if the token is a known non-translatable keyword + elif token.lower() in NON_TRANSLATABLE_KEYWORDS: + output_parts.append(token) + # If the token starts with a known non-translatable prefix, keep it as is + elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): + output_parts.append(token) + # If the token is a pixel value, keep it as is + elif pixel_regex.match(token): + output_parts.append(token) + # Otherwise, assume it's a caption or other translatable text else: - return y + ']]' + output_parts.append(f"{token}") + + # Reconstruct the line with the transformed parts + returnline = '[[' + '|'.join(output_parts) + ']]' + return returnline - # Handle File case - if len(line) > 6 and line[0:2] == "[[" and line[2:6] == "File": - # File processing logic remains the same - returnline = "" - i = 0 - while i < len(line): - if line[i:i+4] == 'alt=': - returnline += "alt=" - i += 4 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline += "" - if i < len(line): - returnline += line[i] - else: - if i < len(line) and line[i] == '|': - if i+1 < len(line) and line[i+1] == ' ': - if i+3 < len(line) and line[i+2:i+4] in ('left'): - returnline += line[i] - elif i+6 < len(line) and line[i+2:i+7] in ('right','center','thumb'): - returnline += line[i] - else: - returnline += "| " - i+=2 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline += "" - if i < len(line): - returnline += line[i] - else: - if i+2 < len(line) and line[i+1:i+3] in ('left'): - returnline += line[i] - elif i+5 < len(line) and line[i+1:i+6] in ('right','center','thumb'): - returnline += line[i] - else: - returnline += "| " - i+=1 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline 
+= "" - if i < len(line): - returnline += line[i] - else: - if i < len(line): - returnline += line[i] - i += 1 - return returnline - link_pattern = r'\[\[(.*?)\]\]' - parts = re.split(link_pattern, line) +def process_internal_link(text, tvar_id): + """ + Processes internal links in the wikitext. + It wraps the content in tags. + """ + assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" + # Split the link into parts, handling both internal links and links with display text - result = "" - for i, part in enumerate(parts): - if i % 2 == 0: - if part.strip(): - colon_parts = re.split(r'(:)', part) - for j, cp in enumerate(colon_parts): - if cp == ':': - result += cp - elif cp.strip(): - result += f"{cp}" - else: - result += cp - else: - result += part - else: - if '|' in part: - link_target, link_text = part.split('|', 1) - if not link_target.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{link_target}|{link_text}]]' - else: - result += f'[[{link_target}|{link_text}]]' - else: - if not part.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{part}|{part}]]' - else: - result += f'[[{part}]]' + inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] + parts = inner_wl.split('|') + + # part 0 + category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] + file_aliases = ['File:', 'file:', 'Image:', 'image:'] - return result -def process_external_link(line): + parts[0] = parts[0].strip() # Clean up the first part + # Check if the first part is a category or file alias + if parts[0].startswith(tuple(category_aliases)): + # Handle category links + cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] + return f'[[Category:{cat_name}{{{{#translation:}}}}]]' + elif parts[0].startswith(tuple(file_aliases)): + # Handle file links + return _process_file(text) + elif parts[0].startswith('Special:'): + # Handle special pages + return f'[[{parts[0]}]]' + + # Assuming it's a regular internal link + if len(parts) == 1: + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]' + if len(parts) == 2 : + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]' + return text + +def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. """ - match = re.match(r'(\[https?://[^\s]+)\s+([^\]]+)\]', line) + match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) if match: url_part = match.group(1) description_part = match.group(2) # Wrap only the description part in tags, leave the URL untouched - return f'{url_part} {description_part}]' - return line + return f'[{url_part} {description_part}]' + return text -def process_lists(line): - """ - Processes lists (e.g., *, #, :) by adding tags around list item content. - """ - for i in range(len(line)): - if line[i] in ['*', '#', ':', ';']: - continue - else: - words = line[i:].split("
") - for j in range(len(words)): - if "https://" in words[j]: - words[j] = f"{words[j]}" - elif "[[" in words[j]: - words[j] = process_double_name_space(words[j]) - else: - worder = words[j].split(":") - for k in range(len(worder)): - if worder[k] == '': - continue - else: - worder[k] = f"{worder[k]}" - words[j] = ":".join(worder) - - newstring = "
".join(words) - return f"{line[:i]}{newstring}" - - -def process_doublecurly(line): +def process_template(text): """ Processes the text to ensure that only the content outside of double curly braces {{ ... }} is wrapped in tags, while preserving the template content inside the braces without translating it. """ - if "{{" in line and "}}" in line: - start = line.index("{{") - end = line.index("}}") + 2 # Include the closing "}}" - inside_curly = line[start:end] - outside_curly = line[end:].strip() - if outside_curly: - return f"{line[:start]}{outside_curly}" - return inside_curly - else: - return f"{line.strip()}" - -def process_blockquote(line): - """ - Handles blockquote tags by ensuring content inside blockquote is not wrapped in tags. - """ - if "
" in line and "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].split("
")[0] - after_blockquote = line.split("
")[1].strip() - - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - - return f'{translated_before}
{blockquote_content}
{translated_after}' - elif "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].strip() - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - return f'{translated_before}
{blockquote_content}' - elif "
" in line: - blockquote_content = line.split("
")[0].strip() - after_blockquote = line.split("
")[1].strip() - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - return f'{blockquote_content}
{translated_after}' - else: - return line -def process_poem_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag" + # Split the template content from the rest of the text + inner_content = text[2:-2] # Remove the leading {{ and trailing }} - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. - :return: Processed line, and updated in_poem_block flag. - """ - opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) - closing_poem_pattern = re.compile(r'()', re.IGNORECASE) - # Case 1: Detect an opening tag (without necessarily having a closing tag) - if opening_poem_pattern.search(line) and not in_poem_block: - opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag - start_idx = line.find(opening_tag) + len(opening_tag) - poem_content = line[start_idx:].strip() # Get the content after the opening tag - # If there's a closing tag within the same line - if closing_poem_pattern.search(line): - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - end_idx = line.find(closing_tag) - - poem_content = line[start_idx:end_idx].strip() # Get content between and - - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - - # Return the fully processed line - return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(closing_tag):])}', False - - else: - # If only the opening tag is present, we are in the middle of a poem block - translated_poem_content = convert_to_translatable_wikitext(poem_content) - return f'{opening_tag}{translated_poem_content}', True - - # Case 2: Detect a closing tag without an opening tag in the same line - elif closing_poem_pattern.search(line) and not in_poem_block: - - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag - print(line.find(closing_tag)) - after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - # print(after_poem) - translated_after_poem = convert_to_translatable_wikitext(after_poem) - # Return the processed line with the closing tag - return f'{translated_poem_content}{closing_tag}{after_poem}', False - - # Case 3: We are inside a block and no closing tag is found in this line - elif in_poem_block: - print(line) - translated_poem_content = convert_to_translatable_wikitext(line.strip()) - return f'{translated_poem_content}', True - - # Case 4: No tag found, return the line as is - return line, in_poem_block -def process_small_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + # If the inner content is empty, return an empty string + if not inner_content.strip(): + return text - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. - :return: Processed line, and updated in_poem_block flag. 
- """ - opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) - closing_poem_pattern = re.compile(r'(
)', re.IGNORECASE) - # Case 1: Detect an opening tag (without necessarily having a closing tag) - if opening_poem_pattern.search(line) and not in_poem_block: - opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag - start_idx = line.find(opening_tag) + len(opening_tag) - poem_content = line[start_idx:].strip() # Get the content after the opening tag - - # If there's a closing tag within the same line - if closing_poem_pattern.search(line): - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - end_idx = line.find(closing_tag) - poem_content = line[start_idx:end_idx].strip() # Get content between and - - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - - # Return the fully processed line - return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(""):])}', False + # Wrap the inner content in tags + return f'{{{{{inner_content}}}}}' + +def process_raw_url(text): + """ + Processes raw URLs in the wikitext. + It wraps the URL in tags. + """ + # This function assumes the text is a raw URL, e.g., "http://example.com" + # and wraps it in tags. + if not text.strip(): + return text + return f"{text.strip()}" + + +# --- Main Tokenization Logic --- - else: - # If only the opening tag is present, we are in the middle of a poem block - translated_poem_content = convert_to_translatable_wikitext(poem_content) - return f'{opening_tag}{translated_poem_content}', True - - # Case 2: Detect a closing tag without an opening tag in the same line - elif closing_poem_pattern.search(line) and not in_poem_block: - - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag - after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - # print(after_poem) - translated_after_poem = convert_to_translatable_wikitext(after_poem) - # Return the processed line with the closing tag - return f'{translated_poem_content}{closing_tag}{after_poem}', False - - # Case 3: We are inside a block and no closing tag is found in this line - elif in_poem_block: - translated_poem_content = convert_to_translatable_wikitext(line.strip()) - return f'{translated_poem_content}', True - - # Case 4: No tag found, return the line as is - return line, in_poem_block -# def process_big_tag(line, in_poem_block=False): -# """ -# Detects and tags and processes the text content inside the poem -# by wrapping it in tags. Handles cases where only one of the tags is present. - -# :param line: The line of text to process. -# :param in_poem_block: A flag to indicate if we are already inside a block. -# :return: Processed line, and updated in_poem_block flag. 
-# """ -# opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) -# closing_poem_pattern = re.compile(r'()', re.IGNORECASE) -# # Case 1: Detect an opening tag (without necessarily having a closing tag) -# if opening_poem_pattern.search(line) and not in_poem_block: -# opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag -# start_idx = line.find(opening_tag) + len(opening_tag) -# print(line[:start_idx]) -# poem_content = line[start_idx:].strip() # Get the content after the opening tag -# # If there's a closing tag within the same line -# if closing_poem_pattern.search(line): -# closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag -# end_idx = line.find(closing_tag) -# poem_content = line[start_idx:end_idx].strip() # Get content between and - -# # Process the poem content by adding tags -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# # Return the fully processed line -# return f'{convert_to_translatable_wikitext(line[:start_idx])}{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(""):])}', False - -# else: -# # If only the opening tag is present, we are in the middle of a poem block -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# return f'{opening_tag}{translated_poem_content}', True - -# # Case 2: Detect a closing tag without an opening tag in the same line -# elif closing_poem_pattern.search(line) and not in_poem_block: - -# closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag -# poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag -# print(line.find(closing_tag)) -# after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() -# # Process the poem content by adding tags -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# # print(after_poem) -# translated_after_poem = convert_to_translatable_wikitext(after_poem) -# # Return the processed line with the closing tag -# return f'{translated_poem_content}{closing_tag}{after_poem}', False - -# # Case 3: We are inside a block and no closing tag is found in this line -# elif in_poem_block: -# translated_poem_content = convert_to_translatable_wikitext(line.strip()) -# return f'{translated_poem_content}', True - -# # Case 4: No tag found, return the line as is -# return line, in_poem_block -def process_code_tag(line): - """ - Processes and tags and ensures that the content inside the tags is not wrapped in tags. - """ - if "" in line: - before_code = line.split("")[0].strip() - code_content = line.split("")[1].split("")[0] - after_code = line.split("")[1].strip() - - translated_before = convert_to_translatable_wikitext(before_code) if before_code else '' - translated_after = convert_to_translatable_wikitext(after_code) if after_code else '' - - return f'{translated_before}{code_content}{translated_after}' - elif "" in line: - before_code = line.split("{code_content}' - elif "" in line: - code_content = line.split("
")[0].strip() - after_code = line.split("")[1].strip() - translated_after = convert_to_translatable_wikitext(after_code) if after_code else '' - return f'{code_content}{translated_after}' - else: - return line -def process_syntax_highlights(line): - """ - Processes and tags and ensures that the content inside the tags is not wrapped in tags. - """ - if "" in line: - before_syntax = line.split("")[0].strip() - syntax_content = line.split("")[1].split("")[0] - after_syntax = line.split("")[1].strip() - - translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else '' - return f'{translated_before}{syntax_content}{after_syntax}' - elif "" in line: - before_syntax = line.split("")[1].strip() - translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else '' - return f'{translated_before}{syntax_content}' - elif "" in line: - syntax_content = line.split("")[0].strip() - after_syntax = line.split("")[1].strip() - translated_after = convert_to_translatable_wikitext(after_syntax) if after_syntax else '' - return f'{syntax_content}{translated_after}' - else: - return line def convert_to_translatable_wikitext(wikitext): - if wikitext == "": + """ + Converts standard wikitext to translatable wikitext by wrapping + translatable text with tags, while preserving and + correctly handling special wikitext elements. + This function tokenizes the entire text, not line by line. + """ + if not wikitext: return "" + + parts = [] + last = 0 + curr = 0 + text_length = len(wikitext) + + while curr < text_length : + # Syntax highlight block + pattern = '', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) + curr = end_pos + last = curr + continue + # Table block + pattern = '{|' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('|}', curr) + len('|}') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_table)) + curr = end_pattern + last = curr + continue + # Blockquote + pattern = '
<blockquote>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</blockquote>
', curr) + len('</blockquote>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_blockquote)) + curr = end_pattern + last = curr + continue + # Poem tag + pattern = '<poem' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</poem>', curr) + len('</poem>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_poem_tag)) + curr = end_pattern + last = curr + continue + # Code tag + pattern = '<code' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</code>', curr) + len('</code>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_code_tag)) + curr = end_pattern + last = curr + continue + # Div tag + pattern = '<div' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</div>', curr) + len('</div>
') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_div)) + curr = end_pattern + last = curr + continue + # Hiero tag + pattern = '<hiero>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</hiero>', curr) + len('</hiero>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_hiero)) + curr = end_pattern + last = curr + continue + # Sub tag + pattern = '<sub>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</sub>', curr) + len('</sub>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Sup tag + pattern = '<sup>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</sup>', curr) + len('</sup>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Math tag + pattern = '<math>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</math>', curr) + len('</math>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_math)) + curr = end_pattern + last = curr + continue + # Small tag + pattern = '<small>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</small>', curr) + len('</small>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_small_tag)) + curr = end_pattern + last = curr + continue + # Nowiki tag + pattern = '<nowiki>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</nowiki>', curr) + len('</nowiki>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_nowiki)) + curr = end_pattern + last = curr + continue + # Lists + patterns_newline = ['\n*', '\n#', '\n:', '\n;'] + if any(wikitext.startswith(p, curr) for p in patterns_newline) : + curr += 1 # Discard the newline character + parts.append((wikitext[last:curr], _wrap_in_translate)) + # Iterate through the list items + patterns = ['*', '#', ':', ';'] + while any(wikitext.startswith(p, curr) for p in patterns) : + end_pattern = wikitext.find('\n', curr) + if end_pattern == -1: + end_pattern = text_length + else : + end_pattern += 1 # Include the newline in the part + parts.append((wikitext[curr:end_pattern], process_item)) + curr = end_pattern + last = curr + continue + # Internal links + pattern = '[[' + if wikitext.startswith(pattern, curr): + # Count the number of opening double brackets '[[' and closing ']]' to find the end + end_pos = curr + 2 + bracket_count = 1 + while end_pos < text_length and bracket_count > 0: + if wikitext.startswith('[[', end_pos): + bracket_count += 1 + end_pos += 2 + elif wikitext.startswith(']]', end_pos): + bracket_count -= 1 + end_pos += 2 + else: + end_pos += 1 + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + if end_pos > curr + 2: # Ensure we have a valid link + parts.append((wikitext[curr:end_pos], process_internal_link)) + curr = end_pos + last = curr + continue + # External links + pattern = '[http' + if wikitext.startswith(pattern, curr): + # Find the end of the external link + end_pos = wikitext.find(']', curr) + if end_pos == -1: + end_pos = text_length + else : + end_pos += 1 # Include the closing ']' in the part
+ if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos + 1], process_external_link)) + curr = end_pos + last = curr + continue + # Templates + pattern = '{{' + if wikitext.startswith(pattern, curr): + # Find the end of the template + end_pos = wikitext.find('}}', curr) + 2 + if end_pos == 1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_template)) + curr = end_pos + last = curr + continue + # Raw URLs + pattern = 'http' + if wikitext.startswith(pattern, curr): + # Find the end of the URL (space or end of string) + end_pos = wikitext.find(' ', curr) + if end_pos == -1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_raw_url)) + curr = end_pos + last = curr + continue + + curr += 1 # Move to the next character if no pattern matched + + # Add any remaining text after the last processed part + if last < text_length: + parts.append((wikitext[last:], _wrap_in_translate)) + """ - Converts standard wikitext to translatable wikitext by wrapping text with tags. - Handles tables, lists, blockquotes, divs, and ensures tags inside blockquotes are not wrapped. + print ('*' * 20) + for i, (part, handler) in enumerate(parts): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + + print ('*' * 20) """ - lines = re.split("\n",wikitext) - converted_lines = [] - in_syntax_highlight = False - in_table = False - for line in lines: - if line is not None: - line = line.strip() - - if line: - if "" in line: - # End of a syntax highlight block - closing_tag_idx = line.index("") - - # Process content before the closing tag - converted_lines.append(line[:closing_tag_idx]) - - # Append the closing syntaxhighlight tag - converted_lines.append(line[closing_tag_idx:]) - in_syntax_highlight = False # Exiting syntax highlight mode - elif in_syntax_highlight: - # Inside a syntaxhighlight block, do not process the line - converted_lines.append(line) - elif line.startswith("'''"): - converted_lines.append(process_lists(line)) - elif line.startswith("{|"): - in_table = True - converted_lines.append(line) - elif line.startswith("|}") and in_table: - in_table = False - converted_lines.append(line) - elif in_table: - converted_lines.append(process_table_line(line)) - elif header_pattern.match(line): - converted_lines.append(process_header(line)) - elif line.startswith("http"): - converted_lines.append(line) - elif line.startswith("[["): - converted_lines.append(process_double_name_space(line)) - elif line.startswith("["): - converted_lines.append(process_external_link(line)) - elif line.startswith(""): - converted_lines.append(line) - elif line.startswith("*") or line.startswith("#") or line.startswith(":") or line.startswith(";"): - converted_lines.append(process_lists(line)) - elif line.startswith("{{"): - converted_lines.append(process_doublecurly(line)) - elif "
" in line or "
" in line: - converted_lines.append(process_blockquote(line)) - elif "" in line: - converted_lines.append(process_poem_tag(line)[0]) - elif "" in line: - converted_lines.append(process_code_tag(line)) - elif ' tag - elif '
' in line: - converted_lines.append(line) - elif "" in line or "" in line: - converted_lines.append(line) # Do not add translate tags inside tag - elif sub_pattern.search(line) or sup_pattern.search(line): - converted_lines.append(line) # Do not add translate tags inside / - elif "" in line or "{{math}}" in line: - converted_lines.append(process_math(line)) # Handle math tags - elif "" in line or "" in line: - # If the line contains tags, we won't wrap them. - converted_lines.append(process_small_tag(line)[0]) + # Process links + tvar_id = 0 + tvar_url_id = 0 + for i, (part, handler) in enumerate(parts): + # Handlers for links require a tvar_id + if handler == process_internal_link: + new_part = handler(part, tvar_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + parts[i] = (new_part, new_handler) + tvar_id += 1 + elif handler == process_external_link: + new_part = handler(part, tvar_url_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + parts[i] = (new_part, new_handler) + tvar_url_id += 1 + + # Scan again the parts: merge consecutive parts that have the same handler, but only if the handler is _wrap_in_translate + _parts = [] + if parts: + current_part, current_handler = parts[0] + for part, handler in parts[1:]: + if handler == _wrap_in_translate and current_handler == _wrap_in_translate: + # Merge the parts + current_part += part else: - converted_lines.append(add_translate_tags(line)) - else: - converted_lines.append('') - converted_lines = [str(line) if line is not None else "" for line in converted_lines] - return '\n'.join(converted_lines) + # Add the current part to the list and start a new one + _parts.append((current_part, current_handler)) + current_part, current_handler = part, handler + # Add the last accumulated part + _parts.append((current_part, current_handler)) + + # Process the parts with their respective handlers + processed_parts = [handler(part) for part, handler in _parts] + + # Debug output + """ + print("Processed parts:") + for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + print(ppart) + print(f"---\n") + """ + + # Join the processed parts into a single string + return ''.join(processed_parts) @app.route('/') def index(): @@ -697,4 +668,4 @@ def api_convert(): }) if __name__ == '__main__': - app.run(debug=True) + app.run(debug=True) \ No newline at end of file From 578310e8d653263ee5e3c010c134c7c03f46e592 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Mon, 7 Jul 2025 07:29:05 +0200 Subject: [PATCH 02/10] Fix file processing --- app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index e0c1d20..0f0c5e7 100644 --- a/app.py +++ b/app.py @@ -229,8 +229,13 @@ def _process_file(s) : NON_TRANSLATABLE_KEYWORDS_PREFIXES = { 'link=', 'upright=' } + file_aliases = ['File:', 'file:', 'Image:', 'image:'] output_parts = [] + tokens = [] + + inner_content = s[2:-2] # Remove the leading [[ and trailing ]] + tokens = inner_content.split('|') # The first token shall start with a file alias # e.g., "File:Example.jpg" or "Image:Example.png" @@ -668,4 +673,4 @@ def api_convert(): }) if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(debug=True) From 18da47980c9076bab125a697e12b918ef549e7a6 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 06:14:08 +0200 Subject: [PATCH 
03/10] Wrap tag in tvar --- app.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/app.py b/app.py index 0f0c5e7..fbf90ba 100644 --- a/app.py +++ b/app.py @@ -86,13 +86,25 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text): +def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. It wraps the content in tags. """ assert(text.startswith('')), "Invalid code tag" - return text + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = f'{content}' + return f"{prefix}{wrapped_content}{suffix}" def process_div(text): """ @@ -587,20 +599,26 @@ def convert_to_translatable_wikitext(wikitext): # Process links tvar_id = 0 tvar_url_id = 0 + tvar_code_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id if handler == process_internal_link: new_part = handler(part, tvar_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_id += 1 elif handler == process_external_link: new_part = handler(part, tvar_url_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_url_id += 1 + elif handler == process_code_tag: + new_part = handler(part, tvar_code_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) + tvar_code_id += 1 - # Scan again the parts: merge consecutive parts that have the same handler, but only if the handler is _wrap_in_translate + # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] if parts: current_part, current_handler = parts[0] @@ -609,7 +627,6 @@ def convert_to_translatable_wikitext(wikitext): # Merge the parts current_part += part else: - # Add the current part to the list and start a new one _parts.append((current_part, current_handler)) current_part, current_handler = part, handler # Add the last accumulated part From c31cede9fdf1a72d584435c00606e0cb979a7871 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 10:05:24 +0200 Subject: [PATCH 04/10] Update function decsriptions --- app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index fbf90ba..cdadbd9 100644 --- a/app.py +++ b/app.py @@ -89,7 +89,7 @@ def process_poem_tag(text): def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. - It wraps the content in tags. + It wraps the content in the tag. """ assert(text.startswith('')), "Invalid code tag" # Get inside the tag @@ -287,7 +287,7 @@ def _process_file(s) : def process_internal_link(text, tvar_id): """ Processes internal links in the wikitext. - It wraps the content in tags. + It wraps the content in tags. 
""" assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" # Split the link into parts, handling both internal links and links with display text @@ -322,7 +322,7 @@ def process_internal_link(text, tvar_id): def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures - that only the description part is wrapped in tags, leaving the URL untouched. + that the URL part is wrapped in tags. """ match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) From 89c29db88968c7a4bb4ccb82459f4baf4f92d499 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 17:53:18 +0200 Subject: [PATCH 05/10] Update app.py Improve _wrap_in_translate --- app.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index cdadbd9..ec73401 100644 --- a/app.py +++ b/app.py @@ -15,10 +15,30 @@ def _wrap_in_translate(text): """ Wraps the given text with tags. It ensures that empty or whitespace-only strings are not wrapped. + The tags are added around the non-whitespace content, + preserving leading and trailing whitespace. """ if not text or not text.strip(): return text - return f"{text}" + + # Find the first and last non-whitespace characters + first_char_index = -1 + last_char_index = -1 + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters + if first_char_index == -1: + first_char_index = i + last_char_index = i + + # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) + if first_char_index == -1: + return text + + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + return f"{leading_whitespace}{content}{trailing_whitespace}" def process_syntax_highlight(text): """ @@ -89,7 +109,7 @@ def process_poem_tag(text): def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. - It wraps the content in the tag. + It wraps the content in tags. """ assert(text.startswith('')), "Invalid code tag" # Get inside the tag @@ -287,7 +307,7 @@ def _process_file(s) : def process_internal_link(text, tvar_id): """ Processes internal links in the wikitext. - It wraps the content in tags. + It wraps the content in tags. """ assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" # Split the link into parts, handling both internal links and links with display text @@ -322,7 +342,7 @@ def process_internal_link(text, tvar_id): def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures - that the URL part is wrapped in tags. + that only the description part is wrapped in tags, leaving the URL untouched. 
""" match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) From ec5e461f6982a5709fac207052cf71136d40bf0e Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 17:48:17 +0200 Subject: [PATCH 06/10] Add icon parsing and more --- app.py | 155 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 130 insertions(+), 25 deletions(-) diff --git a/app.py b/app.py index ec73401..5225161 100644 --- a/app.py +++ b/app.py @@ -1,10 +1,14 @@ from flask import Flask, request, render_template, jsonify from flask_cors import CORS # Import flask-cors import re +from enum import Enum +import sys app = Flask(__name__) CORS(app) # Enable CORS for all routes +behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] + # --- Helper Functions for Processing Different Wikitext Elements --- # These functions are designed to handle specific wikitext structures. # Some will recursively call the main `convert_to_translatable_wikitext` @@ -106,7 +110,7 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text, tvar_code_id): +def process_code_tag(text, tvar_code_id=0): """ Processes tags in the wikitext. It wraps the content in tags. @@ -252,43 +256,106 @@ def process_item(text): return text return f"{text[:offset]} {item_content}\n" -def _process_file(s) : +def is_emoji_unicode(char): + # This is a very simplified set of common emoji ranges. + # A comprehensive list would be much longer and more complex. + # See https://www.unicode.org/Public/emoji/ for full details. 
+ if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons + return True + if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs + return True + if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols + return True + if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols + return True + if 0x2700 <= ord(char) <= 0x27BF: # Dingbats + return True + # Add more ranges as needed for full coverage + return False + +class double_brackets_types(Enum): + wikilink = 1 + category = 2 + inline_icon = 3 + not_inline_icon_file = 4 + special = 5 + invalid_file = 6 + +def _process_file(s, tvar_inline_icon_id=0): # Define keywords that should NOT be translated when found as parameters NON_TRANSLATABLE_KEYWORDS = { - 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', - 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom' + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', + 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}' } NON_TRANSLATABLE_KEYWORDS_PREFIXES = { 'link=', 'upright=' } + NOT_INLINE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}' + } file_aliases = ['File:', 'file:', 'Image:', 'image:'] - output_parts = [] tokens = [] inner_content = s[2:-2] # Remove the leading [[ and trailing ]] tokens = inner_content.split('|') + tokens = [token.strip() for token in tokens] # Clean up whitespace around tokens # The first token shall start with a file alias # e.g., "File:Example.jpg" or "Image:Example.png" if not tokens or not tokens[0].startswith(tuple(file_aliases)): - return line + return line, double_brackets_types.invalid_file - # Extract the file name + # The first token is a file link filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] + tokens[0] = f'File:{filename}' + + # Substitute 'left' with {{dirstart}} + while 'left' in tokens: + tokens[tokens.index('left')] = '{{dirstart}}' + # Substitute 'right' with {{dirend}} + while 'right' in tokens: + tokens[tokens.index('right')] = '{{dirend}}' + + ############################ + # Managing inline icons + ############################# + is_inline_icon = True + for token in tokens: + if token in NOT_INLINE_KEYWORDS: + is_inline_icon = False + break + if is_inline_icon : + # Check if it contains 'alt=' followed by an emoji + for token in tokens: + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + if not any(is_emoji_unicode(char) for char in alt_text): + is_inline_icon = False + break + if is_inline_icon: + # return something like: [[File:smiley.png|alt=🙂]] + returnline = f'[[' + '|'.join(tokens) + ']]' + return returnline, double_brackets_types.inline_icon + + ############################ + # Managing general files + ############################# + + output_parts = [] # The first token is the file name (e.g., "File:Example.jpg") # We substitute any occurrences of "Image:" with "File:" - output_parts.append(f'File:{filename}') + output_parts.append(tokens[0]) pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" for token in tokens[1:]: # Check for 'alt=' if token.startswith('alt='): alt_text = token[len('alt='):].strip() - output_parts.append(f'alt={alt_text}') + output_parts.append('alt='+_wrap_in_translate(alt_text)) # Check if the token is a known non-translatable keyword - elif token.lower() in NON_TRANSLATABLE_KEYWORDS: + elif token in 
NON_TRANSLATABLE_KEYWORDS: output_parts.append(token) # If the token starts with a known non-translatable prefix, keep it as is elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): @@ -302,14 +369,16 @@ def _process_file(s) : # Reconstruct the line with the transformed parts returnline = '[[' + '|'.join(output_parts) + ']]' - return returnline + return returnline, double_brackets_types.not_inline_icon_file -def process_internal_link(text, tvar_id): +def process_double_brackets(text, tvar_id=0): """ Processes internal links in the wikitext. It wraps the content in tags. """ - assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" + if not (text.startswith("[[") and text.endswith("]]")) : + print(f"Input >{text}< must be wrapped in double brackets [[ ]]") + sys.exit(1) # Split the link into parts, handling both internal links and links with display text inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] @@ -324,22 +393,22 @@ def process_internal_link(text, tvar_id): if parts[0].startswith(tuple(category_aliases)): # Handle category links cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] - return f'[[Category:{cat_name}{{{{#translation:}}}}]]' + return f'[[Category:{cat_name}{{{{#translation:}}}}]]', double_brackets_types.category elif parts[0].startswith(tuple(file_aliases)): # Handle file links return _process_file(text) elif parts[0].startswith('Special:'): # Handle special pages - return f'[[{parts[0]}]]' + return f'[[{parts[0]}]]', double_brackets_types.special # Assuming it's a regular internal link if len(parts) == 1: - return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]' + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]', double_brackets_types.wikilink if len(parts) == 2 : - return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]' + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]', double_brackets_types.wikilink return text -def process_external_link(text, tvar_url_id): +def process_external_link(text, tvar_url_id=0): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. @@ -381,7 +450,7 @@ def process_raw_url(text): return f"{text.strip()}" -# --- Main Tokenization Logic --- +# --- Main Tokenisation Logic --- def convert_to_translatable_wikitext(wikitext): """ @@ -399,6 +468,7 @@ def convert_to_translatable_wikitext(wikitext): text_length = len(wikitext) while curr < text_length : + found = None # Syntax highlight block pattern = '', '
', '</syntaxhighlight>
'] + for p in patterns: + if wikitext.startswith(p, curr): + end_pattern = curr + len(p) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], lambda x: x)) + curr = end_pattern + last = curr + found = True + break + if found: + continue # Lists patterns_newline = ['\n*', '\n#', '\n:', '\n;'] if any(wikitext.startswith(p, curr) for p in patterns_newline) : @@ -554,7 +638,7 @@ def convert_to_translatable_wikitext(wikitext): if last < curr: parts.append((wikitext[last:curr], _wrap_in_translate)) if end_pos > curr + 2: # Ensure we have a valid link - parts.append((wikitext[curr:end_pos], process_internal_link)) + parts.append((wikitext[curr:end_pos], process_double_brackets)) curr = end_pos last = curr continue @@ -599,6 +683,16 @@ def convert_to_translatable_wikitext(wikitext): curr = end_pos last = curr continue + # Behaviour switches + for switch in behaviour_switches: + if wikitext.startswith(switch, curr): + end_pos = curr + len(switch) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], lambda x: x)) + curr = end_pos + last = curr + curr += 1 # Move to the next character if no pattern matched @@ -606,7 +700,7 @@ def convert_to_translatable_wikitext(wikitext): if last < text_length: parts.append((wikitext[last:], _wrap_in_translate)) - """ + print ('*' * 20) for i, (part, handler) in enumerate(parts): print(f"--- Start element {i} with handler {handler.__name__} ---") @@ -614,17 +708,21 @@ def convert_to_translatable_wikitext(wikitext): print(f"---\n") print ('*' * 20) - """ + # Process links tvar_id = 0 tvar_url_id = 0 tvar_code_id = 0 + tvar_inline_icon_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id - if handler == process_internal_link: - new_part = handler(part, tvar_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + if handler == process_double_brackets: + new_part, double_brackets_type = handler(part, tvar_id) + if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + else : + new_handler = lambda x: x # No further processing for categories and files parts[i] = (new_part, new_handler) tvar_id += 1 elif handler == process_external_link: @@ -637,6 +735,13 @@ def convert_to_translatable_wikitext(wikitext): new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_code_id += 1 + elif handler == process_double_brackets : + new_part, double_brackets_type = handler(part, tvar_inline_icon_id) + if double_brackets_type == double_brackets_types.inline_icon: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + tvar_inline_icon_id += 1 + else: + new_handler = lambda x: x # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] From fd99d69c8a383be0c40625a665b2794a7ac6de5e Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 17:49:04 +0200 Subject: [PATCH 07/10] Update tests introducing tvars and more; see also: https://meta.wikimedia.org/wiki/Meta:Internationalization_guidelines --- tests.py | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests.py b/tests.py index 5cdde08..0c9cacb 100644 --- a/tests.py +++ b/tests.py @@ -1,8 +1,8 @@ import unittest -from 
app import convert_to_translatable_wikitext, process_double_name_space +from app import convert_to_translatable_wikitext, process_double_brackets class TestTranslatableWikitext(unittest.TestCase): - + def test_section_headers(self): self.assertEqual( convert_to_translatable_wikitext("==HELLO=="), @@ -12,67 +12,67 @@ def test_section_headers(self): def test_file_tag_translations(self): self.assertEqual( convert_to_translatable_wikitext( - "[[File:landscape.jpg |thumb |left |alt=sunset |Photo of a beautiful landscape]]" + '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]' ), - "[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]" + '[[File:landscape.jpg|thumb|{{dirstart}}|alt=sunset|Photo of a beautiful landscape]]' ) def test_internal_and_external_links(self): self.assertEqual( convert_to_translatable_wikitext( - "This is a text with an [[internal link]] and an [https://openstreetmap.org external link]." + 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' ), - "This is a text with an [[internal link]] and an [https://openstreetmap.org external link]." + 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' ) - + def test_category_with_translation(self): self.assertEqual( convert_to_translatable_wikitext("[[Category:Wikipedia]]"), "[[Category:Wikipedia{{#translation:}}]]" ) - + def test_notoc_preserved(self): self.assertEqual( convert_to_translatable_wikitext("__NOTOC__"), "__NOTOC__" ) - + def test_simple_internal_link(self): self.assertEqual( - convert_to_translatable_wikitext("[[link]]"), - "[[Special:MyLanguage/link|link]]" + convert_to_translatable_wikitext('[[link]]'), + '[[Special:MyLanguage/Link|link]]' ) - + def test_multiline_text(self): self.assertEqual( - convert_to_translatable_wikitext(""" - hi iam charan -
-            happy
-            """),
-            "\n<translate>hi iam charan</translate>\n<br>\n<translate>happy</translate>\n"
+            convert_to_translatable_wikitext('\nhi iam charan\n<br>\nhappy\n\n'),
+            '\n<translate>hi iam charan</translate>\n<br>
\nhappy\n\n' ) - + def test_double_namespace_processing(self): self.assertEqual( - process_double_name_space( - "[[File:pretty hello word.png|alt=Hello everybody!]], [[File:smiley.png|alt=😂]] How are you?" + convert_to_translatable_wikitext( + '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ), - "[[File:pretty hello word.png| alt=Hello everybody!]], [[File:smiley.png| alt=😂]] How are you?" + '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ) + def test_double_namespace_without_list_case_1(self): self.assertEqual( - process_double_name_space( - "[[Help]]ing" + convert_to_translatable_wikitext( + '[[Help]]ing' ), - "[[Special:MyLanguage/Help|Help]]ing" + '[[Special:MyLanguage/Help|Help]]ing' ) + def test_double_namespace_without_list_case_2(self): self.assertEqual( - process_double_name_space( - "[[Help]] ing" + convert_to_translatable_wikitext( + '[[Help]] ing' ), - "[[Special:MyLanguage/Help|Help]] ing" + '[[Special:MyLanguage/Help|Help]] ing' ) + + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main(exit=False, failfast=True) From 29167eadeca65a0fc6901d212dbf6c47cc31e081 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 18:32:42 +0200 Subject: [PATCH 08/10] Add new tests --- tests.py | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests.py b/tests.py index 0c9cacb..2112349 100644 --- a/tests.py +++ b/tests.py @@ -72,7 +72,121 @@ def test_double_namespace_without_list_case_2(self): ), '[[Special:MyLanguage/Help|Help]] ing' ) + + def test_template_simple(self): + self.assertEqual( + convert_to_translatable_wikitext("{{Template Name}}"), + "{{Template Name}}" + ) + + def test_template_with_parameters(self): + self.assertEqual( + convert_to_translatable_wikitext("{{Template|param1=Value 1|Value 2}}"), + "{{Template|param1=Value 1|Value 2}}" + ) + + def test_template_nested_in_text(self): + self.assertEqual( + convert_to_translatable_wikitext('Some text with {{a template here}} and more text.'), + 'Some text with {{A template here}} and more text.' + ) + + def test_nowiki_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Some text with [[Raw link]] content."), + "Some text with [[Raw link]] content." + ) + def test_blockquote_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("
<blockquote>This is a quote.</blockquote>"),
+            "<blockquote><translate>This is a quote.</translate></blockquote>
" + ) + + def test_poem_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Line 1\nLine 2"), + "Line 1\nLine 2" + ) + + def test_code_tag_with_tvar(self): + # Assuming process_code_tag assigns tvar names sequentially starting from 0 + self.assertEqual( + convert_to_translatable_wikitext("Here is some code for you."), + "Here is some code for you." + ) + + def test_div_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("
Div content here.
"), + "
Div content here.
" + ) + + def test_hiero_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("hieroglyphics"), + "hieroglyphics" + ) + + def test_sub_sup_tags(self): + self.assertEqual( + convert_to_translatable_wikitext("H2O and E=mc2"), + "H2O and E=mc2" + ) + + def test_math_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("x^2 + y^2 = z^2"), + "x^2 + y^2 = z^2" + ) + + def test_small_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Small text"), + "Small text" + ) + + def test_image_with_upright(self): + self.assertEqual( + convert_to_translatable_wikitext("[[File:Example.jpg|upright=1.5|A larger image]]"), + "[[File:Example.jpg|upright=1.5|A larger image]]" + ) + + def test_multiple_elements_in_one_line(self): + self.assertEqual( + convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), + 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' + ) + + def test_text_around_br_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("First line.
 Second line."),
+            "<translate>First line.</translate><br> <translate>Second line.</translate>"
+        )
+
+    def test_empty_string_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(""),
+            ""
+        )
+
+    def test_whitespace_only_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(" \n\t "),
+            " \n\t "
+        )
+
+    def test_list_items(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"),
+            "* <translate>Item 1</translate>\n** <translate>Sub-item 1.1</translate>\n* <translate>Item 2</translate>\n"
+        )
+
+    def test_definition_list(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(";Term\n:Definition\n:Description"),
+            "; <translate>Term</translate>\n: <translate>Definition</translate>\n: <translate>Description</translate>\n"
+        )
 
 if __name__ == '__main__':
     unittest.main(exit=False, failfast=True)

From f49961a0d4e4355397557791f190a7da6805b3aa Mon Sep 17 00:00:00 2001
From: Super nabla
Date: Sat, 12 Jul 2025 18:33:16 +0200
Subject: [PATCH 09/10] debug item processing

---
 app.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/app.py b/app.py
index 5225161..890c8a2 100644
--- a/app.py
+++ b/app.py
@@ -15,6 +15,15 @@
 # function to process their internal content, ensuring nested elements
 # are also handled correctly.
 
+def capitalise_first_letter(text):
+    """
+    Capitalises the first letter of the given text.
+    If the text is empty or consists only of whitespace, it returns the text unchanged.
+    """
+    if not text or not text.strip():
+        return text
+    return text[0].upper() + text[1:]
+
 def _wrap_in_translate(text):
     """
     Wraps the given text with <translate> tags.
@@ -254,7 +263,7 @@ def process_item(text):
     item_content = text[offset:].strip()
     if not item_content:
         return text
-    return f"{text[:offset]} <translate>{item_content}</translate>\n"
+    return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
 
 def is_emoji_unicode(char):
     # This is a very simplified set of common emoji ranges.
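For readers following the inline-icon logic that the next hunk refines, the rule can be modelled in isolation. The following is a condensed, self-contained sketch, not the patch code itself: the names looks_like_inline_icon, NOT_INLINE and is_emoji are illustrative stand-ins, and the emoji check compresses the simplified ranges used by is_emoji_unicode.

    NOT_INLINE = {'left', 'right', 'centre', 'center', 'thumb', 'frameless',
                  'border', 'none', '{{dirstart}}', '{{dirend}}'}

    def is_emoji(char):
        # Condensed ranges: pictographs, emoticons and transport symbols,
        # plus miscellaneous symbols and dingbats.
        return 0x1F300 <= ord(char) <= 0x1F6FF or 0x2600 <= ord(char) <= 0x27BF

    def looks_like_inline_icon(link):
        tokens = [t.strip() for t in link[2:-2].split('|')]
        if any(t in NOT_INLINE for t in tokens):
            return False  # layout keywords force regular file handling
        for t in tokens[1:]:
            if t.startswith('alt='):
                if not any(is_emoji(c) for c in t[len('alt='):].strip()):
                    return False  # alt text must contain an emoji character
            elif t not in NOT_INLINE:
                return False  # any other parameter (e.g. a caption) disqualifies
        return True

    assert looks_like_inline_icon('[[File:smiley.png|alt=🙂]]')
    assert not looks_like_inline_icon('[[File:landscape.jpg|thumb|alt=sunset|Caption]]')

Under this rule an icon-like link is kept whole, so the sentence around it can later be wrapped in a single translation unit instead of being split at the image.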
@@ -288,7 +297,7 @@ def _process_file(s, tvar_inline_icon_id=0):
         'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}'
     }
     NON_TRANSLATABLE_KEYWORDS_PREFIXES = {
-        'link=', 'upright='
+        'link=', 'upright=', 'alt='
     }
     NOT_INLINE_KEYWORDS = {
         'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}'
@@ -327,12 +336,19 @@ def _process_file(s, tvar_inline_icon_id=0):
                 break
         if is_inline_icon :
             # Check if it contains 'alt=' followed by an emoji
-            for token in tokens:
+            for token in tokens[1:]:
                 if token.startswith('alt='):
                     alt_text = token[len('alt='):].strip()
                     if not any(is_emoji_unicode(char) for char in alt_text):
                         is_inline_icon = False
                         break
+                elif token not in NON_TRANSLATABLE_KEYWORDS:
+                    is_inline_icon = False
+                    break
+                elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES):
+                    is_inline_icon = False
+                    break
+
         if is_inline_icon:
             # return something like: [[File:smiley.png|alt=🙂]]
             returnline = f'[[' + '|'.join(tokens) + ']]'
@@ -403,9 +419,9 @@ def process_double_brackets(text, tvar_id=0):
 
     # Assuming it's a regular internal link
     if len(parts) == 1:
-        return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]', double_brackets_types.wikilink
+        return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink
     if len(parts) == 2 :
-        return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]', double_brackets_types.wikilink
+        return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink
     return text
 
 def process_external_link(text, tvar_url_id=0):
@@ -429,14 +445,15 @@ def process_template(text):
     """
     assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag"
     # Split the template content from the rest of the text
-    inner_content = text[2:-2] # Remove the leading {{ and trailing }}
+    inner_content = text[2:-2].strip() # Remove the leading {{ and trailing }}
+    inner_content = capitalise_first_letter(inner_content) # Capitalise the first letter of the inner content
 
     # If the inner content is empty, return an empty string
-    if not inner_content.strip():
+    if not inner_content :
         return text
 
     # Wrap the inner content in <translate> tags
-    return f'{{{{<translate>{inner_content}</translate>}}}}'
+    return '{{<translate>' + inner_content + '</translate>}}'
 
 def process_raw_url(text):
     """
@@ -447,7 +464,7 @@
     # and wraps it in <translate> tags.
     if not text.strip():
         return text
-    return f"<translate>{text.strip()}</translate>"
+    return text.strip()
 
 # --- Main Tokenisation Logic ---
 
@@ -461,6 +478,9 @@ def convert_to_translatable_wikitext(wikitext):
     """
     if not wikitext:
        return ""
+
+    # add an extra newline at the beginning, useful to process items at the beginning of the text
+    wikitext = '\n' + wikitext
 
     parts = []
     last = 0
@@ -772,7 +792,7 @@ def convert_to_translatable_wikitext(wikitext):
     """
 
     # Join the processed parts into a single string
-    return ''.join(processed_parts)
+    return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning
 
 @app.route('/')
 def index():

From 59cd8c5e82ec131d348349e0a5677dccef41bacd Mon Sep 17 00:00:00 2001
From: Super nabla
Date: Sat, 12 Jul 2025 18:34:55 +0200
Subject: [PATCH 10/10] Cleanup debug prints

---
 app.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/app.py b/app.py
index 890c8a2..0925546 100644
--- a/app.py
+++ b/app.py
@@ -24,6 +24,23 @@ def capitalise_first_letter(text):
         return text
     return text[0].upper() + text[1:]
 
+def is_emoji_unicode(char):
+    # This is a very simplified set of common emoji ranges.
+    # A comprehensive list would be much longer and more complex.
+    # See https://www.unicode.org/Public/emoji/ for full details.
+    if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons
+        return True
+    if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs
+        return True
+    if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols
+        return True
+    if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols
+        return True
+    if 0x2700 <= ord(char) <= 0x27BF: # Dingbats
+        return True
+    # Add more ranges as needed for full coverage
+    return False
+
 def _wrap_in_translate(text):
     """
     Wraps the given text with <translate> tags.
@@ -265,23 +282,6 @@ def process_item(text):
         return text
     return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
 
-def is_emoji_unicode(char):
-    # This is a very simplified set of common emoji ranges.
-    # A comprehensive list would be much longer and more complex.
-    # See https://www.unicode.org/Public/emoji/ for full details.
-    if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons
-        return True
-    if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs
-        return True
-    if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols
-        return True
-    if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols
-        return True
-    if 0x2700 <= ord(char) <= 0x27BF: # Dingbats
-        return True
-    # Add more ranges as needed for full coverage
-    return False
-
 class double_brackets_types(Enum):
     wikilink = 1
     category = 2
@@ -720,7 +720,7 @@ def convert_to_translatable_wikitext(wikitext):
     if last < text_length:
         parts.append((wikitext[last:], _wrap_in_translate))
 
-
+    """
     print ('*' * 20)
     for i, (part, handler) in enumerate(parts):
         print(f"--- Start element {i} with handler {handler.__name__} ---")
         print(f"---\n")
     print ('*' * 20)
-
+    """
 
     # Process links
     tvar_id = 0
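One detail of PATCH 09 that the diff only shows in two distant hunks is the sentinel newline: list markers are matched as '\n*', '\n#', '\n:' and '\n;', so the converter prepends a newline before tokenising and slices it off the joined result. Below is a minimal, self-contained model of that round trip; convert and its single '*'-item rule are illustrative simplifications, not the app's actual tokenizer.

    def convert(text):
        text = '\n' + text                  # sentinel so '\n*' can match at the start
        out = []
        i = 0
        while i < len(text):
            if text.startswith('\n*', i):   # a list item begins here
                end = text.find('\n', i + 1)
                end = len(text) if end == -1 else end
                item = text[i + 2:end].strip()
                out.append('\n* <translate>' + item + '</translate>')
                i = end
            else:
                out.append(text[i])
                i += 1
        return ''.join(out)[1:]             # drop the sentinel newline

    print(convert('* Item 1\n* Item 2'))
    # -> * <translate>Item 1</translate>
    #    * <translate>Item 2</translate>

The same pattern generalises to the other markers: the sentinel guarantees that a list starting at offset 0 is still recognised, and the final [1:] removes exactly the one character that was prepended.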