From e3be339ef113550158f3a62ff0a230290c89183f Mon Sep 17 00:00:00 2001 From: ftosoni Date: Mon, 7 Jul 2025 05:31:31 +0200 Subject: [PATCH 01/10] implement tvar & cleanup --- app.py | 1173 +++++++++++++++++++++++++++----------------------------- 1 file changed, 572 insertions(+), 601 deletions(-) diff --git a/app.py b/app.py index c22a4cc..e0c1d20 100644 --- a/app.py +++ b/app.py @@ -4,657 +4,628 @@ app = Flask(__name__) CORS(app) # Enable CORS for all routes -# Alternatively, to restrict origins: -# CORS(app, resources={r"/api/*": {"origins": "https://meta.wikimedia.org"}}) - -# Regex to check if a line already has tags -# Updated regex to detect any presence of tags (including comments and spaces) -translate_tag_pattern = re.compile(r"]*>.*?", re.DOTALL) -# Regex to match attributes like rowspan, colspan, etc. -attribute_pattern = re.compile(r"\b\w+(?!==)=([^\s|]+)") -# Regex to detect table cell separators (| and ||) -table_cell_separator_pattern = re.compile(r"(\|\||\||\*)") -# Regex to detect headers in the format of == Header == -header_pattern = re.compile(r"^(=+)(.*?)(=+)$") -# Regex to detect table header cell separators (! and !!) -header_cell_separator_pattern = re.compile(r"(!{1,2})") -# Regex to detect HTML entities (special characters) -special_char_pattern = re.compile(r"&\w+;") -# Regex for hiero, sub, sup, and math tags -# Matches text wrapped in ... -hiero_pattern = re.compile(r'(.*?)') -# Matches text wrapped in ... tags or Unicode subscript characters (e.g., ₀). -sub_pattern = re.compile(r'(.*?|̀[0-9];)') -# Matches text wrapped in ... tags or Unicode superscript characters (e.g., , ¹). -sup_pattern = re.compile(r'(.*?|̾[0-9];|&sup[0-9];)') -# Matches text wrapped in ... tags. -math_tag_pattern = re.compile(r'(.*?)') -# Matches {{math|...}} templates. -math_template_pattern = re.compile(r'(\{\{math\|.*?\}\})') -# Matches time strings in formats like "12:34", "3:45PM", or "11:00am". -time_pattern = re.compile(r'\b\d{1,2}:\d{2}(AM|PM|am|pm)?\b') -# Matches and tags. -gallery_pattern = re.compile(r'|') -# Matches occurrences of "File:". -file_pattern = re.compile(r'File:') -# Matches
<br> tags. -br_pattern = re.compile(r'<br>
') -# Matches magic words wrapped in double underscores (e.g., __NOTOC__). -magic_word = re.compile(r'__(.*?)__') -# Matches occurrences of the word "alt". -alt_pattern = re.compile(r'alt') -# Matches text inside double square brackets (e.g., [[example]]). -square_bracket_text_pattern = re.compile(r'\[\[(.*?)\]\]') -# Matches links with a pipe separator in double square brackets (e.g., [[link|display text]]). -square_bracket_with_pipeline_pattern = re.compile(r'\[\[([^\|\]]+)\|([^\]]+)\]\]') -# Matches occurrences of the '#' -existing_translation_pattern = re.compile(r'#') - - - -def add_translate_tags(text): - """ - Wraps the entire text in tags if it doesn't already have them, - ensuring that special characters (e.g., ì), time values (e.g., 9:30AM), - and certain tags (e.g., hiero, sub, sup, math) are not wrapped in tags. - Skips adding tags if they are already present, even with comments or special content. - """ - if not text.strip(): - return text - if re.search(r'.*', text): - return text +# --- Helper Functions for Processing Different Wikitext Elements --- +# These functions are designed to handle specific wikitext structures. +# Some will recursively call the main `convert_to_translatable_wikitext` +# function to process their internal content, ensuring nested elements +# are also handled correctly. - # If the text already has tags (including comments), do not add them - if translate_tag_pattern.search(text): +def _wrap_in_translate(text): + """ + Wraps the given text with tags. + It ensures that empty or whitespace-only strings are not wrapped. + """ + if not text or not text.strip(): return text + return f"{text}" - # If the text has any special characters, time values, or certain tags, don't wrap it in tags - if (attribute_pattern.search(text) or special_char_pattern.match(text) or - hiero_pattern.search(text) or sub_pattern.search(text) or sup_pattern.search(text) or - time_pattern.match(text) or gallery_pattern.search(text) or file_pattern.search(text) or br_pattern.search(text) or magic_word.search(text)) : # Skip time values +def process_syntax_highlight(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid syntax highlight tag" + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: return text - # Wrap the entire block of text in tags - return f'{text}' + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_math(line): +def process_table(text): """ - Processes math-related tags ({{math}}, , etc.) and ensures their content is not wrapped in tags. + Processes table blocks in the wikitext. + It wraps the content in tags. """ - # Generalized regex for math-related tags - math_patterns = [ - re.compile(r'(\{\{math\|.*?\}\})', re.DOTALL), # For {{math}} templates - re.compile(r'(.*?)', re.DOTALL) # For tags with attributes and content - ] + assert(text.startswith('{|') and text.endswith('|}')), "Invalid table tag" + return text - for pattern in math_patterns: - match = pattern.search(line) - if match: - math_content = match.group(0) - return line.replace(math_content, math_content) # Return math-related content as is +def process_blockquote(text): + """ + Processes blockquote tags in the wikitext. 
+ It wraps the content in <translate> tags. + """ + assert(text.startswith('<blockquote
') and text.endswith('</blockquote>
')), "Invalid blockquote tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - return line +def process_poem_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid poem tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_table_line(line): +def process_code_tag(text): """ - Processes a single line of a table and adds tags where necessary, - ensuring that only the actual content of table cells is wrapped, not the separators. + Processes tags in the wikitext. + It wraps the content in tags. """ - if not line: - return line + assert(text.startswith('')), "Invalid code tag" + return text - if line.startswith("|+"): - # For table caption - return f'{line[:2]}{add_translate_tags(line[2:].strip()) if len(line) > 2 else ""}' - elif line.startswith("|-"): - # Table row separator - return line - elif line.startswith("!"): - # For table headers, split on ! and !! without breaking words - headers = header_cell_separator_pattern.split(line) - translated_headers = [] - for header in headers: - if header in ['!', '!!']: # Preserve the ! and !! without adding translate tags - translated_headers.append(header) - else: - # Safely process header content - processed_header = header.strip() - if processed_header: - processed_header = process_external_link(processed_header) - processed_header = process_double_name_space(processed_header) - translated_headers.append(add_translate_tags(processed_header)) - return "".join(translated_headers) - else: - # For table rows, ensure content is wrapped but separators are untouched - cells = table_cell_separator_pattern.split(line) - translated_cells = [] - for cell in cells: - if cell in ['|', '||', '*']: # Leave separators as is - translated_cells.append(cell) - elif cell and cell.startswith("[["): - # Process wiki links using process_double_name_space - processed_cell = process_double_name_space(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("http"): - # Process external links - processed_cell = process_external_link(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("{{"): - # Process double curly braces - processed_cell = process_doublecurly(cell) - translated_cells.append(processed_cell) - elif cell: - translated_cells.append(add_translate_tags(cell.strip())) - return "".join(translated_cells) - -def process_div(line): - """ - Processes any
<div> tag and adds <translate> tags around the text content inside the div, - while keeping the div structure and attributes intact. - """ - # Regex pattern to detect <div>
tags - div_pattern = re.compile(r'(<div[^>]*>)(.*?)(</div>
)', re.DOTALL) - match = div_pattern.search(line) +def process_div(text): + """ + Processes <div>
tags in the wikitext. + It wraps the content in <translate> tags. + """ + assert(text.startswith('<div')), "Invalid div tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in <translate> tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - if match: - opening_div_tag = match.group(1) # <div ...>
- div_content = match.group(2) # Text or content inside the div - closing_div_tag = match.group(3) # </div>
+def process_hiero(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid hiero tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - # Wrap only the text content inside the div with tags - translated_content = add_translate_tags(div_content.strip()) +def process_sub_sup(text): + """ + Processes and tags in the wikitext. + It wraps the content in tags. + """ + assert((text.startswith('') and text.endswith('')) or + (text.startswith('') and text.endswith(''))), "Invalid sub/sup tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" - return f'{opening_div_tag}{translated_content}{closing_div_tag}' - return line +def process_math(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid math tag" + return text -def process_header(line): - match = header_pattern.match(line) - if match: - translated_header_text = add_translate_tags(match.group()) - return translated_header_text - return line +def process_small_tag(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid small tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" +def process_nowiki(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('') and text.endswith('')), "Invalid nowiki tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" -def process_double_name_space(line): +def process_item(text): """ - Double Name space (e.g., [[link/eg]]) and adds tags around the eg text. - Also handles simple internal links by adding Special:MyLanguage prefix. - Properly handles text that appears after closing brackets by adding translate tags. - Does not put translate tags around colons. + Processes list items in the wikitext. + It wraps the content in tags. 
""" - if 'Special:MyLanguage/' in line: + offset = 0 + if text.startswith(';'): + offset = 1 + elif text.startswith(':'): + offset = 1 + elif text.startswith('#'): + while text[offset] == '#': + offset += 1 + elif text.startswith('*'): + while text[offset] == '*': + offset += 1 + # Add translate tags around the item content + item_content = text[offset:].strip() + if not item_content: + return text + return f"{text[:offset]} {item_content}\n" + +def _process_file(s) : + # Define keywords that should NOT be translated when found as parameters + NON_TRANSLATABLE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', + 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom' + } + NON_TRANSLATABLE_KEYWORDS_PREFIXES = { + 'link=', 'upright=' + } + + output_parts = [] + + # The first token shall start with a file alias + # e.g., "File:Example.jpg" or "Image:Example.png" + if not tokens or not tokens[0].startswith(tuple(file_aliases)): return line - if line.lower().startswith("[[category:"): - y = line.split(']]')[0] - if not existing_translation_pattern.search(line): - return y + '{{#translation:}}]]' + # Extract the file name + filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] + + # The first token is the file name (e.g., "File:Example.jpg") + # We substitute any occurrences of "Image:" with "File:" + output_parts.append(f'File:{filename}') + + pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" + for token in tokens[1:]: + # Check for 'alt=' + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + output_parts.append(f'alt={alt_text}') + # Check if the token is a known non-translatable keyword + elif token.lower() in NON_TRANSLATABLE_KEYWORDS: + output_parts.append(token) + # If the token starts with a known non-translatable prefix, keep it as is + elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): + output_parts.append(token) + # If the token is a pixel value, keep it as is + elif pixel_regex.match(token): + output_parts.append(token) + # Otherwise, assume it's a caption or other translatable text else: - return y + ']]' + output_parts.append(f"{token}") + + # Reconstruct the line with the transformed parts + returnline = '[[' + '|'.join(output_parts) + ']]' + return returnline - # Handle File case - if len(line) > 6 and line[0:2] == "[[" and line[2:6] == "File": - # File processing logic remains the same - returnline = "" - i = 0 - while i < len(line): - if line[i:i+4] == 'alt=': - returnline += "alt=" - i += 4 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline += "" - if i < len(line): - returnline += line[i] - else: - if i < len(line) and line[i] == '|': - if i+1 < len(line) and line[i+1] == ' ': - if i+3 < len(line) and line[i+2:i+4] in ('left'): - returnline += line[i] - elif i+6 < len(line) and line[i+2:i+7] in ('right','center','thumb'): - returnline += line[i] - else: - returnline += "| " - i+=2 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline += "" - if i < len(line): - returnline += line[i] - else: - if i+2 < len(line) and line[i+1:i+3] in ('left'): - returnline += line[i] - elif i+5 < len(line) and line[i+1:i+6] in ('right','center','thumb'): - returnline += line[i] - else: - returnline += "| " - i+=1 - while i < len(line) and line[i] != '|' and line[i] != ']': - returnline += line[i] - i += 1 - returnline 
+= "" - if i < len(line): - returnline += line[i] - else: - if i < len(line): - returnline += line[i] - i += 1 - return returnline - link_pattern = r'\[\[(.*?)\]\]' - parts = re.split(link_pattern, line) +def process_internal_link(text, tvar_id): + """ + Processes internal links in the wikitext. + It wraps the content in tags. + """ + assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" + # Split the link into parts, handling both internal links and links with display text - result = "" - for i, part in enumerate(parts): - if i % 2 == 0: - if part.strip(): - colon_parts = re.split(r'(:)', part) - for j, cp in enumerate(colon_parts): - if cp == ':': - result += cp - elif cp.strip(): - result += f"{cp}" - else: - result += cp - else: - result += part - else: - if '|' in part: - link_target, link_text = part.split('|', 1) - if not link_target.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{link_target}|{link_text}]]' - else: - result += f'[[{link_target}|{link_text}]]' - else: - if not part.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{part}|{part}]]' - else: - result += f'[[{part}]]' + inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] + parts = inner_wl.split('|') + + # part 0 + category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] + file_aliases = ['File:', 'file:', 'Image:', 'image:'] - return result -def process_external_link(line): + parts[0] = parts[0].strip() # Clean up the first part + # Check if the first part is a category or file alias + if parts[0].startswith(tuple(category_aliases)): + # Handle category links + cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] + return f'[[Category:{cat_name}{{{{#translation:}}}}]]' + elif parts[0].startswith(tuple(file_aliases)): + # Handle file links + return _process_file(text) + elif parts[0].startswith('Special:'): + # Handle special pages + return f'[[{parts[0]}]]' + + # Assuming it's a regular internal link + if len(parts) == 1: + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]' + if len(parts) == 2 : + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]' + return text + +def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. """ - match = re.match(r'(\[https?://[^\s]+)\s+([^\]]+)\]', line) + match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) if match: url_part = match.group(1) description_part = match.group(2) # Wrap only the description part in tags, leave the URL untouched - return f'{url_part} {description_part}]' - return line + return f'[{url_part} {description_part}]' + return text -def process_lists(line): - """ - Processes lists (e.g., *, #, :) by adding tags around list item content. - """ - for i in range(len(line)): - if line[i] in ['*', '#', ':', ';']: - continue - else: - words = line[i:].split("
") - for j in range(len(words)): - if "https://" in words[j]: - words[j] = f"{words[j]}" - elif "[[" in words[j]: - words[j] = process_double_name_space(words[j]) - else: - worder = words[j].split(":") - for k in range(len(worder)): - if worder[k] == '': - continue - else: - worder[k] = f"{worder[k]}" - words[j] = ":".join(worder) - - newstring = "
".join(words) - return f"{line[:i]}{newstring}" - - -def process_doublecurly(line): +def process_template(text): """ Processes the text to ensure that only the content outside of double curly braces {{ ... }} is wrapped in tags, while preserving the template content inside the braces without translating it. """ - if "{{" in line and "}}" in line: - start = line.index("{{") - end = line.index("}}") + 2 # Include the closing "}}" - inside_curly = line[start:end] - outside_curly = line[end:].strip() - if outside_curly: - return f"{line[:start]}{outside_curly}" - return inside_curly - else: - return f"{line.strip()}" - -def process_blockquote(line): - """ - Handles blockquote tags by ensuring content inside blockquote is not wrapped in tags. - """ - if "
" in line and "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].split("
")[0] - after_blockquote = line.split("
")[1].strip() - - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - - return f'{translated_before}
{blockquote_content}
{translated_after}' - elif "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].strip() - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - return f'{translated_before}
{blockquote_content}' - elif "
" in line: - blockquote_content = line.split("
")[0].strip() - after_blockquote = line.split("
")[1].strip() - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - return f'{blockquote_content}
{translated_after}' - else: - return line -def process_poem_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag" + # Split the template content from the rest of the text + inner_content = text[2:-2] # Remove the leading {{ and trailing }} - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. - :return: Processed line, and updated in_poem_block flag. - """ - opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) - closing_poem_pattern = re.compile(r'()', re.IGNORECASE) - # Case 1: Detect an opening tag (without necessarily having a closing tag) - if opening_poem_pattern.search(line) and not in_poem_block: - opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag - start_idx = line.find(opening_tag) + len(opening_tag) - poem_content = line[start_idx:].strip() # Get the content after the opening tag - # If there's a closing tag within the same line - if closing_poem_pattern.search(line): - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - end_idx = line.find(closing_tag) - - poem_content = line[start_idx:end_idx].strip() # Get content between and - - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - - # Return the fully processed line - return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(closing_tag):])}', False - - else: - # If only the opening tag is present, we are in the middle of a poem block - translated_poem_content = convert_to_translatable_wikitext(poem_content) - return f'{opening_tag}{translated_poem_content}', True - - # Case 2: Detect a closing tag without an opening tag in the same line - elif closing_poem_pattern.search(line) and not in_poem_block: - - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag - print(line.find(closing_tag)) - after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - # print(after_poem) - translated_after_poem = convert_to_translatable_wikitext(after_poem) - # Return the processed line with the closing tag - return f'{translated_poem_content}{closing_tag}{after_poem}', False - - # Case 3: We are inside a block and no closing tag is found in this line - elif in_poem_block: - print(line) - translated_poem_content = convert_to_translatable_wikitext(line.strip()) - return f'{translated_poem_content}', True - - # Case 4: No tag found, return the line as is - return line, in_poem_block -def process_small_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + # If the inner content is empty, return an empty string + if not inner_content.strip(): + return text - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. - :return: Processed line, and updated in_poem_block flag. 
- """ - opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) - closing_poem_pattern = re.compile(r'(
)', re.IGNORECASE) - # Case 1: Detect an opening tag (without necessarily having a closing tag) - if opening_poem_pattern.search(line) and not in_poem_block: - opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag - start_idx = line.find(opening_tag) + len(opening_tag) - poem_content = line[start_idx:].strip() # Get the content after the opening tag - - # If there's a closing tag within the same line - if closing_poem_pattern.search(line): - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - end_idx = line.find(closing_tag) - poem_content = line[start_idx:end_idx].strip() # Get content between and - - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - - # Return the fully processed line - return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(""):])}', False + # Wrap the inner content in tags + return f'{{{{{inner_content}}}}}' + +def process_raw_url(text): + """ + Processes raw URLs in the wikitext. + It wraps the URL in tags. + """ + # This function assumes the text is a raw URL, e.g., "http://example.com" + # and wraps it in tags. + if not text.strip(): + return text + return f"{text.strip()}" + + +# --- Main Tokenization Logic --- - else: - # If only the opening tag is present, we are in the middle of a poem block - translated_poem_content = convert_to_translatable_wikitext(poem_content) - return f'{opening_tag}{translated_poem_content}', True - - # Case 2: Detect a closing tag without an opening tag in the same line - elif closing_poem_pattern.search(line) and not in_poem_block: - - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag - after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - # print(after_poem) - translated_after_poem = convert_to_translatable_wikitext(after_poem) - # Return the processed line with the closing tag - return f'{translated_poem_content}{closing_tag}{after_poem}', False - - # Case 3: We are inside a block and no closing tag is found in this line - elif in_poem_block: - translated_poem_content = convert_to_translatable_wikitext(line.strip()) - return f'{translated_poem_content}', True - - # Case 4: No tag found, return the line as is - return line, in_poem_block -# def process_big_tag(line, in_poem_block=False): -# """ -# Detects and tags and processes the text content inside the poem -# by wrapping it in tags. Handles cases where only one of the tags is present. - -# :param line: The line of text to process. -# :param in_poem_block: A flag to indicate if we are already inside a block. -# :return: Processed line, and updated in_poem_block flag. 
-# """ -# opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) -# closing_poem_pattern = re.compile(r'()', re.IGNORECASE) -# # Case 1: Detect an opening tag (without necessarily having a closing tag) -# if opening_poem_pattern.search(line) and not in_poem_block: -# opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag -# start_idx = line.find(opening_tag) + len(opening_tag) -# print(line[:start_idx]) -# poem_content = line[start_idx:].strip() # Get the content after the opening tag -# # If there's a closing tag within the same line -# if closing_poem_pattern.search(line): -# closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag -# end_idx = line.find(closing_tag) -# poem_content = line[start_idx:end_idx].strip() # Get content between and - -# # Process the poem content by adding tags -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# # Return the fully processed line -# return f'{convert_to_translatable_wikitext(line[:start_idx])}{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(""):])}', False - -# else: -# # If only the opening tag is present, we are in the middle of a poem block -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# return f'{opening_tag}{translated_poem_content}', True - -# # Case 2: Detect a closing tag without an opening tag in the same line -# elif closing_poem_pattern.search(line) and not in_poem_block: - -# closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag -# poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag -# print(line.find(closing_tag)) -# after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() -# # Process the poem content by adding tags -# translated_poem_content = convert_to_translatable_wikitext(poem_content) -# # print(after_poem) -# translated_after_poem = convert_to_translatable_wikitext(after_poem) -# # Return the processed line with the closing tag -# return f'{translated_poem_content}{closing_tag}{after_poem}', False - -# # Case 3: We are inside a block and no closing tag is found in this line -# elif in_poem_block: -# translated_poem_content = convert_to_translatable_wikitext(line.strip()) -# return f'{translated_poem_content}', True - -# # Case 4: No tag found, return the line as is -# return line, in_poem_block -def process_code_tag(line): - """ - Processes and tags and ensures that the content inside the tags is not wrapped in tags. - """ - if "" in line: - before_code = line.split("")[0].strip() - code_content = line.split("")[1].split("")[0] - after_code = line.split("")[1].strip() - - translated_before = convert_to_translatable_wikitext(before_code) if before_code else '' - translated_after = convert_to_translatable_wikitext(after_code) if after_code else '' - - return f'{translated_before}{code_content}{translated_after}' - elif "" in line: - before_code = line.split("{code_content}' - elif "" in line: - code_content = line.split("
")[0].strip() - after_code = line.split("")[1].strip() - translated_after = convert_to_translatable_wikitext(after_code) if after_code else '' - return f'{code_content}{translated_after}' - else: - return line -def process_syntax_highlights(line): - """ - Processes and tags and ensures that the content inside the tags is not wrapped in tags. - """ - if "" in line: - before_syntax = line.split("")[0].strip() - syntax_content = line.split("")[1].split("")[0] - after_syntax = line.split("")[1].strip() - - translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else '' - return f'{translated_before}{syntax_content}{after_syntax}' - elif "" in line: - before_syntax = line.split("")[1].strip() - translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else '' - return f'{translated_before}{syntax_content}' - elif "" in line: - syntax_content = line.split("")[0].strip() - after_syntax = line.split("")[1].strip() - translated_after = convert_to_translatable_wikitext(after_syntax) if after_syntax else '' - return f'{syntax_content}{translated_after}' - else: - return line def convert_to_translatable_wikitext(wikitext): - if wikitext == "": + """ + Converts standard wikitext to translatable wikitext by wrapping + translatable text with tags, while preserving and + correctly handling special wikitext elements. + This function tokenizes the entire text, not line by line. + """ + if not wikitext: return "" + + parts = [] + last = 0 + curr = 0 + text_length = len(wikitext) + + while curr < text_length : + # Syntax highlight block + pattern = '', curr) + len('') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_syntax_highlight)) + curr = end_pos + last = curr + continue + # Table block + pattern = '{|' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('|}', curr) + len('|}') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_table)) + curr = end_pattern + last = curr + continue + # Blockquote + pattern = '
<blockquote>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</blockquote>
', curr) + len('</blockquote>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_blockquote)) + curr = end_pattern + last = curr + continue + # Poem tag + pattern = '<poem' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</poem>', curr) + len('</poem>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_poem_tag)) + curr = end_pattern + last = curr + continue + # Code tag + pattern = '<code' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</code>', curr) + len('</code>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_code_tag)) + curr = end_pattern + last = curr + continue + # Div tag + pattern = '<div' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</div>', curr) + len('</div>
') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_div)) + curr = end_pattern + last = curr + continue + # Hiero tag + pattern = '<hiero>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</hiero>', curr) + len('</hiero>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_hiero)) + curr = end_pattern + last = curr + continue + # Sub tag + pattern = '<sub>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</sub>', curr) + len('</sub>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Sup tag + pattern = '<sup>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</sup>', curr) + len('</sup>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_sub_sup)) + curr = end_pattern + last = curr + continue + # Math tag + pattern = '<math>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</math>', curr) + len('</math>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_math)) + curr = end_pattern + last = curr + continue + # Small tag + pattern = '<small>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</small>', curr) + len('</small>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_small_tag)) + curr = end_pattern + last = curr + continue + # Nowiki tag + pattern = '<nowiki>' + if wikitext.startswith(pattern, curr): + end_pattern = wikitext.find('</nowiki>', curr) + len('</nowiki>') + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], process_nowiki)) + curr = end_pattern + last = curr + continue + # Lists + patterns_newline = ['\n*', '\n#', '\n:', '\n;'] + if any(wikitext.startswith(p, curr) for p in patterns_newline) : + curr += 1 # Discard the newline character + parts.append((wikitext[last:curr], _wrap_in_translate)) + # Iterate through the list items + patterns = ['*', '#', ':', ';'] + while any(wikitext.startswith(p, curr) for p in patterns) : + end_pattern = wikitext.find('\n', curr) + if end_pattern == -1: + end_pattern = text_length + else : + end_pattern += 1 # Include the newline in the part + parts.append((wikitext[curr:end_pattern], process_item)) + curr = end_pattern + last = curr + continue + # Internal links + pattern = '[[' + if wikitext.startswith(pattern, curr): + # Count the number of opening double brackets '[[' and closing ']]' to find the end + end_pos = curr + 2 + bracket_count = 1 + while end_pos < text_length and bracket_count > 0: + if wikitext.startswith('[[', end_pos): + bracket_count += 1 + end_pos += 2 + elif wikitext.startswith(']]', end_pos): + bracket_count -= 1 + end_pos += 2 + else: + end_pos += 1 + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + if end_pos > curr + 2: # Ensure we have a valid link + parts.append((wikitext[curr:end_pos], process_internal_link)) + curr = end_pos + last = curr + continue + # External links + pattern = '[http' + if wikitext.startswith(pattern, curr): + # Find the end of the external link + end_pos = wikitext.find(']', curr) + if end_pos == -1: + end_pos = text_length + else : + end_pos += 1 # Include the closing ']' in the part
+ if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos + 1], process_external_link)) + curr = end_pos + last = curr + continue + # Templates + pattern = '{{' + if wikitext.startswith(pattern, curr): + # Find the end of the template + end_pos = wikitext.find('}}', curr) + 2 + if end_pos == 1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_template)) + curr = end_pos + last = curr + continue + # Raw URLs + pattern = 'http' + if wikitext.startswith(pattern, curr): + # Find the end of the URL (space or end of string) + end_pos = wikitext.find(' ', curr) + if end_pos == -1: + end_pos = text_length + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], process_raw_url)) + curr = end_pos + last = curr + continue + + curr += 1 # Move to the next character if no pattern matched + + # Add any remaining text after the last processed part + if last < text_length: + parts.append((wikitext[last:], _wrap_in_translate)) + """ - Converts standard wikitext to translatable wikitext by wrapping text with tags. - Handles tables, lists, blockquotes, divs, and ensures tags inside blockquotes are not wrapped. + print ('*' * 20) + for i, (part, handler) in enumerate(parts): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + + print ('*' * 20) """ - lines = re.split("\n",wikitext) - converted_lines = [] - in_syntax_highlight = False - in_table = False - for line in lines: - if line is not None: - line = line.strip() - - if line: - if "" in line: - # End of a syntax highlight block - closing_tag_idx = line.index("") - - # Process content before the closing tag - converted_lines.append(line[:closing_tag_idx]) - - # Append the closing syntaxhighlight tag - converted_lines.append(line[closing_tag_idx:]) - in_syntax_highlight = False # Exiting syntax highlight mode - elif in_syntax_highlight: - # Inside a syntaxhighlight block, do not process the line - converted_lines.append(line) - elif line.startswith("'''"): - converted_lines.append(process_lists(line)) - elif line.startswith("{|"): - in_table = True - converted_lines.append(line) - elif line.startswith("|}") and in_table: - in_table = False - converted_lines.append(line) - elif in_table: - converted_lines.append(process_table_line(line)) - elif header_pattern.match(line): - converted_lines.append(process_header(line)) - elif line.startswith("http"): - converted_lines.append(line) - elif line.startswith("[["): - converted_lines.append(process_double_name_space(line)) - elif line.startswith("["): - converted_lines.append(process_external_link(line)) - elif line.startswith(""): - converted_lines.append(line) - elif line.startswith("*") or line.startswith("#") or line.startswith(":") or line.startswith(";"): - converted_lines.append(process_lists(line)) - elif line.startswith("{{"): - converted_lines.append(process_doublecurly(line)) - elif "
" in line or "
" in line: - converted_lines.append(process_blockquote(line)) - elif "" in line: - converted_lines.append(process_poem_tag(line)[0]) - elif "" in line: - converted_lines.append(process_code_tag(line)) - elif ' tag - elif '
' in line: - converted_lines.append(line) - elif "" in line or "" in line: - converted_lines.append(line) # Do not add translate tags inside tag - elif sub_pattern.search(line) or sup_pattern.search(line): - converted_lines.append(line) # Do not add translate tags inside / - elif "" in line or "{{math}}" in line: - converted_lines.append(process_math(line)) # Handle math tags - elif "" in line or "" in line: - # If the line contains tags, we won't wrap them. - converted_lines.append(process_small_tag(line)[0]) + # Process links + tvar_id = 0 + tvar_url_id = 0 + for i, (part, handler) in enumerate(parts): + # Handlers for links require a tvar_id + if handler == process_internal_link: + new_part = handler(part, tvar_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + parts[i] = (new_part, new_handler) + tvar_id += 1 + elif handler == process_external_link: + new_part = handler(part, tvar_url_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + parts[i] = (new_part, new_handler) + tvar_url_id += 1 + + # Scan again the parts: merge consecutive parts that have the same handler, but only if the handler is _wrap_in_translate + _parts = [] + if parts: + current_part, current_handler = parts[0] + for part, handler in parts[1:]: + if handler == _wrap_in_translate and current_handler == _wrap_in_translate: + # Merge the parts + current_part += part else: - converted_lines.append(add_translate_tags(line)) - else: - converted_lines.append('') - converted_lines = [str(line) if line is not None else "" for line in converted_lines] - return '\n'.join(converted_lines) + # Add the current part to the list and start a new one + _parts.append((current_part, current_handler)) + current_part, current_handler = part, handler + # Add the last accumulated part + _parts.append((current_part, current_handler)) + + # Process the parts with their respective handlers + processed_parts = [handler(part) for part, handler in _parts] + + # Debug output + """ + print("Processed parts:") + for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)): + print(f"--- Start element {i} with handler {handler.__name__} ---") + print(part) + print(f"---\n") + print(ppart) + print(f"---\n") + """ + + # Join the processed parts into a single string + return ''.join(processed_parts) @app.route('/') def index(): @@ -697,4 +668,4 @@ def api_convert(): }) if __name__ == '__main__': - app.run(debug=True) + app.run(debug=True) \ No newline at end of file From 578310e8d653263ee5e3c010c134c7c03f46e592 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Mon, 7 Jul 2025 07:29:05 +0200 Subject: [PATCH 02/10] Fix file processing --- app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index e0c1d20..0f0c5e7 100644 --- a/app.py +++ b/app.py @@ -229,8 +229,13 @@ def _process_file(s) : NON_TRANSLATABLE_KEYWORDS_PREFIXES = { 'link=', 'upright=' } + file_aliases = ['File:', 'file:', 'Image:', 'image:'] output_parts = [] + tokens = [] + + inner_content = s[2:-2] # Remove the leading [[ and trailing ]] + tokens = inner_content.split('|') # The first token shall start with a file alias # e.g., "File:Example.jpg" or "Image:Example.png" @@ -668,4 +673,4 @@ def api_convert(): }) if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(debug=True) From 18da47980c9076bab125a697e12b918ef549e7a6 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 06:14:08 +0200 Subject: [PATCH 
03/10] Wrap tag in tvar --- app.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/app.py b/app.py index 0f0c5e7..fbf90ba 100644 --- a/app.py +++ b/app.py @@ -86,13 +86,25 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text): +def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. It wraps the content in tags. """ assert(text.startswith('')), "Invalid code tag" - return text + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: + return text + # Wrap the content in tags + wrapped_content = f'{content}' + return f"{prefix}{wrapped_content}{suffix}" def process_div(text): """ @@ -587,20 +599,26 @@ def convert_to_translatable_wikitext(wikitext): # Process links tvar_id = 0 tvar_url_id = 0 + tvar_code_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id if handler == process_internal_link: new_part = handler(part, tvar_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_id += 1 elif handler == process_external_link: new_part = handler(part, tvar_url_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate for consistency + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_url_id += 1 + elif handler == process_code_tag: + new_part = handler(part, tvar_code_id) + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + parts[i] = (new_part, new_handler) + tvar_code_id += 1 - # Scan again the parts: merge consecutive parts that have the same handler, but only if the handler is _wrap_in_translate + # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] if parts: current_part, current_handler = parts[0] @@ -609,7 +627,6 @@ def convert_to_translatable_wikitext(wikitext): # Merge the parts current_part += part else: - # Add the current part to the list and start a new one _parts.append((current_part, current_handler)) current_part, current_handler = part, handler # Add the last accumulated part From c31cede9fdf1a72d584435c00606e0cb979a7871 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 10:05:24 +0200 Subject: [PATCH 04/10] Update function decsriptions --- app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index fbf90ba..cdadbd9 100644 --- a/app.py +++ b/app.py @@ -89,7 +89,7 @@ def process_poem_tag(text): def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. - It wraps the content in tags. + It wraps the content in the tag. """ assert(text.startswith('')), "Invalid code tag" # Get inside the tag @@ -287,7 +287,7 @@ def _process_file(s) : def process_internal_link(text, tvar_id): """ Processes internal links in the wikitext. - It wraps the content in tags. + It wraps the content in tags. 
""" assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" # Split the link into parts, handling both internal links and links with display text @@ -322,7 +322,7 @@ def process_internal_link(text, tvar_id): def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures - that only the description part is wrapped in tags, leaving the URL untouched. + that the URL part is wrapped in tags. """ match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) From 89c29db88968c7a4bb4ccb82459f4baf4f92d499 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Tue, 8 Jul 2025 17:53:18 +0200 Subject: [PATCH 05/10] Update app.py Improve _wrap_in_translate --- app.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index cdadbd9..ec73401 100644 --- a/app.py +++ b/app.py @@ -15,10 +15,30 @@ def _wrap_in_translate(text): """ Wraps the given text with tags. It ensures that empty or whitespace-only strings are not wrapped. + The tags are added around the non-whitespace content, + preserving leading and trailing whitespace. """ if not text or not text.strip(): return text - return f"{text}" + + # Find the first and last non-whitespace characters + first_char_index = -1 + last_char_index = -1 + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters + if first_char_index == -1: + first_char_index = i + last_char_index = i + + # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) + if first_char_index == -1: + return text + + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + return f"{leading_whitespace}{content}{trailing_whitespace}" def process_syntax_highlight(text): """ @@ -89,7 +109,7 @@ def process_poem_tag(text): def process_code_tag(text, tvar_code_id): """ Processes tags in the wikitext. - It wraps the content in the tag. + It wraps the content in tags. """ assert(text.startswith('')), "Invalid code tag" # Get inside the tag @@ -287,7 +307,7 @@ def _process_file(s) : def process_internal_link(text, tvar_id): """ Processes internal links in the wikitext. - It wraps the content in tags. + It wraps the content in tags. """ assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" # Split the link into parts, handling both internal links and links with display text @@ -322,7 +342,7 @@ def process_internal_link(text, tvar_id): def process_external_link(text, tvar_url_id): """ Processes external links in the format [http://example.com Description] and ensures - that the URL part is wrapped in tags. + that only the description part is wrapped in tags, leaving the URL untouched. 
""" match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) From ec5e461f6982a5709fac207052cf71136d40bf0e Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 17:48:17 +0200 Subject: [PATCH 06/10] Add icon parsing and more --- app.py | 155 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 130 insertions(+), 25 deletions(-) diff --git a/app.py b/app.py index ec73401..5225161 100644 --- a/app.py +++ b/app.py @@ -1,10 +1,14 @@ from flask import Flask, request, render_template, jsonify from flask_cors import CORS # Import flask-cors import re +from enum import Enum +import sys app = Flask(__name__) CORS(app) # Enable CORS for all routes +behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__'] + # --- Helper Functions for Processing Different Wikitext Elements --- # These functions are designed to handle specific wikitext structures. # Some will recursively call the main `convert_to_translatable_wikitext` @@ -106,7 +110,7 @@ def process_poem_tag(text): wrapped_content = _wrap_in_translate(content) return f"{prefix}{wrapped_content}{suffix}" -def process_code_tag(text, tvar_code_id): +def process_code_tag(text, tvar_code_id=0): """ Processes tags in the wikitext. It wraps the content in tags. @@ -252,43 +256,106 @@ def process_item(text): return text return f"{text[:offset]} {item_content}\n" -def _process_file(s) : +def is_emoji_unicode(char): + # This is a very simplified set of common emoji ranges. + # A comprehensive list would be much longer and more complex. + # See https://www.unicode.org/Public/emoji/ for full details. 
+ if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons + return True + if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs + return True + if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols + return True + if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols + return True + if 0x2700 <= ord(char) <= 0x27BF: # Dingbats + return True + # Add more ranges as needed for full coverage + return False + +class double_brackets_types(Enum): + wikilink = 1 + category = 2 + inline_icon = 3 + not_inline_icon_file = 4 + special = 5 + invalid_file = 6 + +def _process_file(s, tvar_inline_icon_id=0): # Define keywords that should NOT be translated when found as parameters NON_TRANSLATABLE_KEYWORDS = { - 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', - 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom' + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', + 'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}' } NON_TRANSLATABLE_KEYWORDS_PREFIXES = { 'link=', 'upright=' } + NOT_INLINE_KEYWORDS = { + 'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}' + } file_aliases = ['File:', 'file:', 'Image:', 'image:'] - output_parts = [] tokens = [] inner_content = s[2:-2] # Remove the leading [[ and trailing ]] tokens = inner_content.split('|') + tokens = [token.strip() for token in tokens] # Clean up whitespace around tokens # The first token shall start with a file alias # e.g., "File:Example.jpg" or "Image:Example.png" if not tokens or not tokens[0].startswith(tuple(file_aliases)): - return line + return line, double_brackets_types.invalid_file - # Extract the file name + # The first token is a file link filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0] + tokens[0] = f'File:{filename}' + + # Substitute 'left' with {{dirstart}} + while 'left' in tokens: + tokens[tokens.index('left')] = '{{dirstart}}' + # Substitute 'right' with {{dirend}} + while 'right' in tokens: + tokens[tokens.index('right')] = '{{dirend}}' + + ############################ + # Managing inline icons + ############################# + is_inline_icon = True + for token in tokens: + if token in NOT_INLINE_KEYWORDS: + is_inline_icon = False + break + if is_inline_icon : + # Check if it contains 'alt=' followed by an emoji + for token in tokens: + if token.startswith('alt='): + alt_text = token[len('alt='):].strip() + if not any(is_emoji_unicode(char) for char in alt_text): + is_inline_icon = False + break + if is_inline_icon: + # return something like: [[File:smiley.png|alt=🙂]] + returnline = f'[[' + '|'.join(tokens) + ']]' + return returnline, double_brackets_types.inline_icon + + ############################ + # Managing general files + ############################# + + output_parts = [] # The first token is the file name (e.g., "File:Example.jpg") # We substitute any occurrences of "Image:" with "File:" - output_parts.append(f'File:{filename}') + output_parts.append(tokens[0]) pixel_regex = re.compile(r'\d+(?:x\d+)?px') # Matches pixel values like "100px" or "100x50px)" for token in tokens[1:]: # Check for 'alt=' if token.startswith('alt='): alt_text = token[len('alt='):].strip() - output_parts.append(f'alt={alt_text}') + output_parts.append('alt='+_wrap_in_translate(alt_text)) # Check if the token is a known non-translatable keyword - elif token.lower() in NON_TRANSLATABLE_KEYWORDS: + elif token in 
NON_TRANSLATABLE_KEYWORDS: output_parts.append(token) # If the token starts with a known non-translatable prefix, keep it as is elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES): @@ -302,14 +369,16 @@ def _process_file(s) : # Reconstruct the line with the transformed parts returnline = '[[' + '|'.join(output_parts) + ']]' - return returnline + return returnline, double_brackets_types.not_inline_icon_file -def process_internal_link(text, tvar_id): +def process_double_brackets(text, tvar_id=0): """ Processes internal links in the wikitext. It wraps the content in tags. """ - assert (text.startswith("[[") and text.endswith("]]")), "Input must be a valid wiki link format [[...]]" + if not (text.startswith("[[") and text.endswith("]]")) : + print(f"Input >{text}< must be wrapped in double brackets [[ ]]") + sys.exit(1) # Split the link into parts, handling both internal links and links with display text inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] @@ -324,22 +393,22 @@ def process_internal_link(text, tvar_id): if parts[0].startswith(tuple(category_aliases)): # Handle category links cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] - return f'[[Category:{cat_name}{{{{#translation:}}}}]]' + return f'[[Category:{cat_name}{{{{#translation:}}}}]]', double_brackets_types.category elif parts[0].startswith(tuple(file_aliases)): # Handle file links return _process_file(text) elif parts[0].startswith('Special:'): # Handle special pages - return f'[[{parts[0]}]]' + return f'[[{parts[0]}]]', double_brackets_types.special # Assuming it's a regular internal link if len(parts) == 1: - return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]' + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]', double_brackets_types.wikilink if len(parts) == 2 : - return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]' + return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]', double_brackets_types.wikilink return text -def process_external_link(text, tvar_url_id): +def process_external_link(text, tvar_url_id=0): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. @@ -381,7 +450,7 @@ def process_raw_url(text): return f"{text.strip()}" -# --- Main Tokenization Logic --- +# --- Main Tokenisation Logic --- def convert_to_translatable_wikitext(wikitext): """ @@ -399,6 +468,7 @@ def convert_to_translatable_wikitext(wikitext): text_length = len(wikitext) while curr < text_length : + found = None # Syntax highlight block pattern = '', '
', '</syntaxhighlight>
'] + for p in patterns: + if wikitext.startswith(p, curr): + end_pattern = curr + len(p) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pattern], lambda x: x)) + curr = end_pattern + last = curr + found = True + break + if found: + continue # Lists patterns_newline = ['\n*', '\n#', '\n:', '\n;'] if any(wikitext.startswith(p, curr) for p in patterns_newline) : @@ -554,7 +638,7 @@ def convert_to_translatable_wikitext(wikitext): if last < curr: parts.append((wikitext[last:curr], _wrap_in_translate)) if end_pos > curr + 2: # Ensure we have a valid link - parts.append((wikitext[curr:end_pos], process_internal_link)) + parts.append((wikitext[curr:end_pos], process_double_brackets)) curr = end_pos last = curr continue @@ -599,6 +683,16 @@ def convert_to_translatable_wikitext(wikitext): curr = end_pos last = curr continue + # Behaviour switches + for switch in behaviour_switches: + if wikitext.startswith(switch, curr): + end_pos = curr + len(switch) + if last < curr: + parts.append((wikitext[last:curr], _wrap_in_translate)) + parts.append((wikitext[curr:end_pos], lambda x: x)) + curr = end_pos + last = curr + curr += 1 # Move to the next character if no pattern matched @@ -606,7 +700,7 @@ def convert_to_translatable_wikitext(wikitext): if last < text_length: parts.append((wikitext[last:], _wrap_in_translate)) - """ + print ('*' * 20) for i, (part, handler) in enumerate(parts): print(f"--- Start element {i} with handler {handler.__name__} ---") @@ -614,17 +708,21 @@ def convert_to_translatable_wikitext(wikitext): print(f"---\n") print ('*' * 20) - """ + # Process links tvar_id = 0 tvar_url_id = 0 tvar_code_id = 0 + tvar_inline_icon_id = 0 for i, (part, handler) in enumerate(parts): # Handlers for links require a tvar_id - if handler == process_internal_link: - new_part = handler(part, tvar_id) - new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + if handler == process_double_brackets: + new_part, double_brackets_type = handler(part, tvar_id) + if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + else : + new_handler = lambda x: x # No further processing for categories and files parts[i] = (new_part, new_handler) tvar_id += 1 elif handler == process_external_link: @@ -637,6 +735,13 @@ def convert_to_translatable_wikitext(wikitext): new_handler = _wrap_in_translate # Change handler to _wrap_in_translate parts[i] = (new_part, new_handler) tvar_code_id += 1 + elif handler == process_double_brackets : + new_part, double_brackets_type = handler(part, tvar_inline_icon_id) + if double_brackets_type == double_brackets_types.inline_icon: + new_handler = _wrap_in_translate # Change handler to _wrap_in_translate + tvar_inline_icon_id += 1 + else: + new_handler = lambda x: x # Scan again the parts: merge consecutive parts handled by _wrap_in_translate _parts = [] From fd99d69c8a383be0c40625a665b2794a7ac6de5e Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 17:49:04 +0200 Subject: [PATCH 07/10] Update tests introducing tvars and more; see also: https://meta.wikimedia.org/wiki/Meta:Internationalization_guidelines --- tests.py | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests.py b/tests.py index 5cdde08..0c9cacb 100644 --- a/tests.py +++ b/tests.py @@ -1,8 +1,8 @@ import unittest -from 
app import convert_to_translatable_wikitext, process_double_name_space +from app import convert_to_translatable_wikitext, process_double_brackets class TestTranslatableWikitext(unittest.TestCase): - + def test_section_headers(self): self.assertEqual( convert_to_translatable_wikitext("==HELLO=="), @@ -12,67 +12,67 @@ def test_section_headers(self): def test_file_tag_translations(self): self.assertEqual( convert_to_translatable_wikitext( - "[[File:landscape.jpg |thumb |left |alt=sunset |Photo of a beautiful landscape]]" + '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]' ), - "[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]" + '[[File:landscape.jpg|thumb|{{dirstart}}|alt=sunset|Photo of a beautiful landscape]]' ) def test_internal_and_external_links(self): self.assertEqual( convert_to_translatable_wikitext( - "This is a text with an [[internal link]] and an [https://openstreetmap.org external link]." + 'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].' ), - "This is a text with an [[internal link]] and an [https://openstreetmap.org external link]." + 'This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [https://openstreetmap.org external link].' ) - + def test_category_with_translation(self): self.assertEqual( convert_to_translatable_wikitext("[[Category:Wikipedia]]"), "[[Category:Wikipedia{{#translation:}}]]" ) - + def test_notoc_preserved(self): self.assertEqual( convert_to_translatable_wikitext("__NOTOC__"), "__NOTOC__" ) - + def test_simple_internal_link(self): self.assertEqual( - convert_to_translatable_wikitext("[[link]]"), - "[[Special:MyLanguage/link|link]]" + convert_to_translatable_wikitext('[[link]]'), + '[[Special:MyLanguage/Link|link]]' ) - + def test_multiline_text(self): self.assertEqual( - convert_to_translatable_wikitext(""" - hi iam charan -
-            happy
-            """),
-            "\n<translate>hi iam charan</translate>\n<br>\n<translate>happy</translate>\n"
+            convert_to_translatable_wikitext('\nhi iam charan\n<br>\nhappy\n\n'),
+            '\n<translate>hi iam charan</translate>\n<br>
\nhappy\n\n' ) - + def test_double_namespace_processing(self): self.assertEqual( - process_double_name_space( - "[[File:pretty hello word.png|alt=Hello everybody!]], [[File:smiley.png|alt=😂]] How are you?" + convert_to_translatable_wikitext( + '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ), - "[[File:pretty hello word.png| alt=Hello everybody!]], [[File:smiley.png| alt=😂]] How are you?" + '[[File:pretty hello word.png|alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?' ) + def test_double_namespace_without_list_case_1(self): self.assertEqual( - process_double_name_space( - "[[Help]]ing" + convert_to_translatable_wikitext( + '[[Help]]ing' ), - "[[Special:MyLanguage/Help|Help]]ing" + '[[Special:MyLanguage/Help|Help]]ing' ) + def test_double_namespace_without_list_case_2(self): self.assertEqual( - process_double_name_space( - "[[Help]] ing" + convert_to_translatable_wikitext( + '[[Help]] ing' ), - "[[Special:MyLanguage/Help|Help]] ing" + '[[Special:MyLanguage/Help|Help]] ing' ) + + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main(exit=False, failfast=True) From 29167eadeca65a0fc6901d212dbf6c47cc31e081 Mon Sep 17 00:00:00 2001 From: Super nabla Date: Sat, 12 Jul 2025 18:32:42 +0200 Subject: [PATCH 08/10] Add new tests --- tests.py | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests.py b/tests.py index 0c9cacb..2112349 100644 --- a/tests.py +++ b/tests.py @@ -72,7 +72,121 @@ def test_double_namespace_without_list_case_2(self): ), '[[Special:MyLanguage/Help|Help]] ing' ) + + def test_template_simple(self): + self.assertEqual( + convert_to_translatable_wikitext("{{Template Name}}"), + "{{Template Name}}" + ) + + def test_template_with_parameters(self): + self.assertEqual( + convert_to_translatable_wikitext("{{Template|param1=Value 1|Value 2}}"), + "{{Template|param1=Value 1|Value 2}}" + ) + + def test_template_nested_in_text(self): + self.assertEqual( + convert_to_translatable_wikitext('Some text with {{a template here}} and more text.'), + 'Some text with {{A template here}} and more text.' + ) + + def test_nowiki_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Some text with [[Raw link]] content."), + "Some text with [[Raw link]] content." + ) + def test_blockquote_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("
<blockquote>This is a quote.</blockquote>"),
+            "<blockquote><translate>This is a quote.</translate></blockquote>
" + ) + + def test_poem_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Line 1\nLine 2"), + "Line 1\nLine 2" + ) + + def test_code_tag_with_tvar(self): + # Assuming process_code_tag assigns tvar names sequentially starting from 0 + self.assertEqual( + convert_to_translatable_wikitext("Here is some code for you."), + "Here is some code for you." + ) + + def test_div_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("
Div content here.
"), + "
Div content here.
" + ) + + def test_hiero_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("hieroglyphics"), + "hieroglyphics" + ) + + def test_sub_sup_tags(self): + self.assertEqual( + convert_to_translatable_wikitext("H2O and E=mc2"), + "H2O and E=mc2" + ) + + def test_math_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("x^2 + y^2 = z^2"), + "x^2 + y^2 = z^2" + ) + + def test_small_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("Small text"), + "Small text" + ) + + def test_image_with_upright(self): + self.assertEqual( + convert_to_translatable_wikitext("[[File:Example.jpg|upright=1.5|A larger image]]"), + "[[File:Example.jpg|upright=1.5|A larger image]]" + ) + + def test_multiple_elements_in_one_line(self): + self.assertEqual( + convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"), + 'Hello world! [[Special:MyLanguage/Link|Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]' + ) + + def test_text_around_br_tag(self): + self.assertEqual( + convert_to_translatable_wikitext("First line.
 Second line."),
+            "<translate>First line.</translate><br> <translate>Second line.</translate>"
+        )
+
+    def test_empty_string_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(""),
+            ""
+        )
+
+    def test_whitespace_only_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(" \n\t "),
+            " \n\t "
+        )
+
+    def test_list_items(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"),
+            "* <translate>Item 1</translate>\n** <translate>Sub-item 1.1</translate>\n* <translate>Item 2</translate>\n"
+        )
+
+    def test_definition_list(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(";Term\n:Definition\n:Description"),
+            "; <translate>Term</translate>\n: <translate>Definition</translate>\n: <translate>Description</translate>\n"
+        )
 
 if __name__ == '__main__':
     unittest.main(exit=False, failfast=True)

From f49961a0d4e4355397557791f190a7da6805b3aa Mon Sep 17 00:00:00 2001
From: Super nabla
Date: Sat, 12 Jul 2025 18:33:16 +0200
Subject: [PATCH 09/10] debug item processing

---
 app.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/app.py b/app.py
index 5225161..890c8a2 100644
--- a/app.py
+++ b/app.py
@@ -15,6 +15,15 @@
 # function to process their internal content, ensuring nested elements
 # are also handled correctly.
 
+def capitalise_first_letter(text):
+    """
+    Capitalises the first letter of the given text.
+    If the text is empty or consists only of whitespace, it returns the text unchanged.
+    """
+    if not text or not text.strip():
+        return text
+    return text[0].upper() + text[1:]
+
 def _wrap_in_translate(text):
     """
     Wraps the given text with <translate> tags.
@@ -254,7 +263,7 @@ def process_item(text):
     item_content = text[offset:].strip()
     if not item_content:
         return text
-    return f"{text[:offset]} <translate>{item_content}</translate>\n"
+    return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
 
 def is_emoji_unicode(char):
     # This is a very simplified set of common emoji ranges.
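For readers following the inline-icon logic that the next hunk refines, the rule can be modelled in isolation. The following is a condensed, self-contained sketch, not the patch code itself: the names looks_like_inline_icon, NOT_INLINE and is_emoji are illustrative stand-ins, and the emoji check compresses the simplified ranges used by is_emoji_unicode.

    NOT_INLINE = {'left', 'right', 'centre', 'center', 'thumb', 'frameless',
                  'border', 'none', '{{dirstart}}', '{{dirend}}'}

    def is_emoji(char):
        # Condensed ranges: pictographs, emoticons and transport symbols,
        # plus miscellaneous symbols and dingbats.
        return 0x1F300 <= ord(char) <= 0x1F6FF or 0x2600 <= ord(char) <= 0x27BF

    def looks_like_inline_icon(link):
        tokens = [t.strip() for t in link[2:-2].split('|')]
        if any(t in NOT_INLINE for t in tokens):
            return False  # layout keywords force regular file handling
        for t in tokens[1:]:
            if t.startswith('alt='):
                if not any(is_emoji(c) for c in t[len('alt='):].strip()):
                    return False  # alt text must contain an emoji character
            elif t not in NOT_INLINE:
                return False  # any other parameter (e.g. a caption) disqualifies
        return True

    assert looks_like_inline_icon('[[File:smiley.png|alt=🙂]]')
    assert not looks_like_inline_icon('[[File:landscape.jpg|thumb|alt=sunset|Caption]]')

Under this rule an icon-like link is kept whole, so the sentence around it can later be wrapped in a single translation unit instead of being split at the image.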
@@ -288,7 +297,7 @@ def _process_file(s, tvar_inline_icon_id=0):
         'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}'
     }
     NON_TRANSLATABLE_KEYWORDS_PREFIXES = {
-        'link=', 'upright='
+        'link=', 'upright=', 'alt='
     }
     NOT_INLINE_KEYWORDS = {
         'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}'
@@ -327,12 +336,19 @@ def _process_file(s, tvar_inline_icon_id=0):
                 break
         if is_inline_icon :
             # Check if it contains 'alt=' followed by an emoji
-            for token in tokens:
+            for token in tokens[1:]:
                 if token.startswith('alt='):
                     alt_text = token[len('alt='):].strip()
                     if not any(is_emoji_unicode(char) for char in alt_text):
                         is_inline_icon = False
                         break
+                elif token not in NON_TRANSLATABLE_KEYWORDS:
+                    is_inline_icon = False
+                    break
+                elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES):
+                    is_inline_icon = False
+                    break
+
         if is_inline_icon:
             # return something like: [[File:smiley.png|alt=🙂]]
             returnline = f'[[' + '|'.join(tokens) + ']]'
@@ -403,9 +419,9 @@ def process_double_brackets(text, tvar_id=0):
 
     # Assuming it's a regular internal link
     if len(parts) == 1:
-        return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[0]}]]', double_brackets_types.wikilink
+        return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink
     if len(parts) == 2 :
-        return f'[[Special:MyLanguage/{parts[0].capitalize()}|{parts[1]}]]', double_brackets_types.wikilink
+        return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink
     return text
 
 def process_external_link(text, tvar_url_id=0):
@@ -429,14 +445,15 @@ def process_template(text):
     """
     assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag"
     # Split the template content from the rest of the text
-    inner_content = text[2:-2] # Remove the leading {{ and trailing }}
+    inner_content = text[2:-2].strip() # Remove the leading {{ and trailing }}
+    inner_content = capitalise_first_letter(inner_content) # Capitalise the first letter of the inner content
 
     # If the inner content is empty, return an empty string
-    if not inner_content.strip():
+    if not inner_content :
         return text
 
     # Wrap the inner content in <translate> tags
-    return f'{{{{<translate>{inner_content}</translate>}}}}'
+    return '{{<translate>' + inner_content + '</translate>}}'
 
 def process_raw_url(text):
     """
@@ -447,7 +464,7 @@
     # and wraps it in <translate> tags.
     if not text.strip():
         return text
-    return f"<translate>{text.strip()}</translate>"
+    return text.strip()
 
 # --- Main Tokenisation Logic ---
 
@@ -461,6 +478,9 @@ def convert_to_translatable_wikitext(wikitext):
     """
     if not wikitext:
        return ""
+
+    # add an extra newline at the beginning, useful to process items at the beginning of the text
+    wikitext = '\n' + wikitext
 
     parts = []
     last = 0
@@ -772,7 +792,7 @@ def convert_to_translatable_wikitext(wikitext):
     """
 
     # Join the processed parts into a single string
-    return ''.join(processed_parts)
+    return ''.join(processed_parts)[1:] # Remove the leading newline added at the beginning
 
 @app.route('/')
 def index():

From 59cd8c5e82ec131d348349e0a5677dccef41bacd Mon Sep 17 00:00:00 2001
From: Super nabla
Date: Sat, 12 Jul 2025 18:34:55 +0200
Subject: [PATCH 10/10] Cleanup debug prints

---
 app.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/app.py b/app.py
index 890c8a2..0925546 100644
--- a/app.py
+++ b/app.py
@@ -24,6 +24,23 @@ def capitalise_first_letter(text):
         return text
     return text[0].upper() + text[1:]
 
+def is_emoji_unicode(char):
+    # This is a very simplified set of common emoji ranges.
+    # A comprehensive list would be much longer and more complex.
+    # See https://www.unicode.org/Public/emoji/ for full details.
+    if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons
+        return True
+    if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs
+        return True
+    if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols
+        return True
+    if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols
+        return True
+    if 0x2700 <= ord(char) <= 0x27BF: # Dingbats
+        return True
+    # Add more ranges as needed for full coverage
+    return False
+
 def _wrap_in_translate(text):
     """
     Wraps the given text with <translate> tags.
@@ -265,23 +282,6 @@ def process_item(text):
         return text
     return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
 
-def is_emoji_unicode(char):
-    # This is a very simplified set of common emoji ranges.
-    # A comprehensive list would be much longer and more complex.
-    # See https://www.unicode.org/Public/emoji/ for full details.
-    if 0x1F600 <= ord(char) <= 0x1F64F: # Emoticons
-        return True
-    if 0x1F300 <= ord(char) <= 0x1F5FF: # Miscellaneous Symbols and Pictographs
-        return True
-    if 0x1F680 <= ord(char) <= 0x1F6FF: # Transport and Map Symbols
-        return True
-    if 0x2600 <= ord(char) <= 0x26FF: # Miscellaneous Symbols
-        return True
-    if 0x2700 <= ord(char) <= 0x27BF: # Dingbats
-        return True
-    # Add more ranges as needed for full coverage
-    return False
-
 class double_brackets_types(Enum):
     wikilink = 1
     category = 2
@@ -720,7 +720,7 @@ def convert_to_translatable_wikitext(wikitext):
     if last < text_length:
         parts.append((wikitext[last:], _wrap_in_translate))
 
-
+    """
     print ('*' * 20)
     for i, (part, handler) in enumerate(parts):
         print(f"--- Start element {i} with handler {handler.__name__} ---")
         print(f"---\n")
     print ('*' * 20)
-
+    """
 
     # Process links
     tvar_id = 0
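One detail of PATCH 09 that the diff only shows in two distant hunks is the sentinel newline: list markers are matched as '\n*', '\n#', '\n:' and '\n;', so the converter prepends a newline before tokenising and slices it off the joined result. Below is a minimal, self-contained model of that round trip; convert and its single '*'-item rule are illustrative simplifications, not the app's actual tokenizer.

    def convert(text):
        text = '\n' + text                  # sentinel so '\n*' can match at the start
        out = []
        i = 0
        while i < len(text):
            if text.startswith('\n*', i):   # a list item begins here
                end = text.find('\n', i + 1)
                end = len(text) if end == -1 else end
                item = text[i + 2:end].strip()
                out.append('\n* <translate>' + item + '</translate>')
                i = end
            else:
                out.append(text[i])
                i += 1
        return ''.join(out)[1:]             # drop the sentinel newline

    print(convert('* Item 1\n* Item 2'))
    # -> * <translate>Item 1</translate>
    #    * <translate>Item 2</translate>

The same pattern generalises to the other markers: the sentinel guarantees that a list starting at offset 0 is still recognised, and the final [1:] removes exactly the one character that was prepended.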