diff --git a/app.py b/app.py
index c22a4cc..0925546 100644
--- a/app.py
+++ b/app.py
@@ -1,660 +1,798 @@
 from flask import Flask, request, render_template, jsonify
 from flask_cors import CORS  # Import flask-cors
 import re
+from enum import Enum
+import sys
 
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
-# Alternatively, to restrict origins:
-# CORS(app, resources={r"/api/*": {"origins": "https://meta.wikimedia.org"}})
-
-# Regex to check if a line already has <translate> tags
-# Updated regex to detect any presence of <translate> tags (including comments and spaces)
-translate_tag_pattern = re.compile(r"<translate[^>]*>.*?</translate>", re.DOTALL)
-# Regex to match attributes like rowspan, colspan, etc.
-attribute_pattern = re.compile(r"\b\w+(?!==)=([^\s|]+)")
-# Regex to detect table cell separators (| and ||)
-table_cell_separator_pattern = re.compile(r"(\|\||\||\*)")
-# Regex to detect headers in the format of == Header ==
-header_pattern = re.compile(r"^(=+)(.*?)(=+)$")
-# Regex to detect table header cell separators (! and !!)
-header_cell_separator_pattern = re.compile(r"(!{1,2})")
-# Regex to detect HTML entities (special characters)
-special_char_pattern = re.compile(r"&\w+;")
-# Regex for hiero, sub, sup, and math tags
-# Matches text wrapped in <hiero>...</hiero>
-hiero_pattern = re.compile(r'(<hiero>.*?</hiero>)')
-# Matches text wrapped in <sub>...</sub> tags or Unicode subscript characters (e.g., &#8320;).
-sub_pattern = re.compile(r'(<sub>.*?</sub>|&#832[0-9];)')
-# Matches text wrapped in <sup>...</sup> tags or Unicode superscript characters (e.g., &#8304;, &sup1;).
-sup_pattern = re.compile(r'(<sup>.*?</sup>|&#830[0-9];|&sup[0-9];)')
-# Matches text wrapped in <math>...</math> tags.
-math_tag_pattern = re.compile(r'(<math>.*?</math>)')
-# Matches {{math|...}} templates.
-math_template_pattern = re.compile(r'(\{\{math\|.*?\}\})')
-# Matches time strings in formats like "12:34", "3:45PM", or "11:00am".
-time_pattern = re.compile(r'\b\d{1,2}:\d{2}(AM|PM|am|pm)?\b')
-# Matches <gallery> and </gallery> tags.
-gallery_pattern = re.compile(r'<gallery>|</gallery>')
-# Matches occurrences of "File:".
-file_pattern = re.compile(r'File:')
-# Matches <br> tags.
-br_pattern = re.compile(r'<br\s*/?>')
-# Matches magic words wrapped in double underscores (e.g., __NOTOC__).
-magic_word = re.compile(r'__(.*?)__')
-# Matches occurrences of the word "alt".
-alt_pattern = re.compile(r'alt')
-# Matches text inside double square brackets (e.g., [[example]]).
-square_bracket_text_pattern = re.compile(r'\[\[(.*?)\]\]')
-# Matches links with a pipe separator in double square brackets (e.g., [[link|display text]]).
-square_bracket_with_pipeline_pattern = re.compile(r'\[\[([^\|\]]+)\|([^\]]+)\]\]')
-# Matches occurrences of the '#'
-existing_translation_pattern = re.compile(r'#')
-
-
-
-def add_translate_tags(text):
-    """
-    Wraps the entire text in <translate> tags if it doesn't already have them,
-    ensuring that special characters (e.g., &igrave;), time values (e.g., 9:30AM),
-    and certain tags (e.g., hiero, sub, sup, math) are not wrapped in <translate> tags.
-    Skips adding tags if they are already present, even with comments or special content.
-    """
-    if not text.strip():
-        return text
-    if re.search(r'<translate>.*</translate>', text):
-        return text
-    # If the text already has <translate> tags (including comments), do not add them
-    if translate_tag_pattern.search(text):
-        return text
-
-    # If the text has any special characters, time values, or certain tags, don't wrap it in <translate> tags
-    if (attribute_pattern.search(text) or special_char_pattern.match(text) or
-        hiero_pattern.search(text) or sub_pattern.search(text) or sup_pattern.search(text) or
-        time_pattern.match(text) or gallery_pattern.search(text) or file_pattern.search(text) or br_pattern.search(text) or magic_word.search(text)):  # Skip time values
-        return text
-    # Wrap the entire block of text in <translate> tags
-    return f'<translate>{text}</translate>'
+
+behaviour_switches = ['__NOTOC__', '__FORCETOC__', '__TOC__', '__NOEDITSECTION__', '__NEWSECTIONLINK__', '__NONEWSECTIONLINK__', '__NOGALLERY__', '__HIDDENCAT__', '__EXPECTUNUSEDCATEGORY__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOTITLECONVERT__', '__NOTC__', '__START__', '__END__', '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', '__EXPECTUNUSEDTEMPLATE__', '__NOGLOBAL__', '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__', '__ARCHIVEDTALK__', '__NOTALK__', '__EXPECTWITHOUTSCANS__']
+
+# --- Helper Functions for Processing Different Wikitext Elements ---
+# These functions are designed to handle specific wikitext structures.
+# Some will recursively call the main `convert_to_translatable_wikitext`
+# function to process their internal content, ensuring nested elements
+# are also handled correctly.
+
+def capitalise_first_letter(text):
+    """
+    Capitalises the first letter of the given text.
+    If the text is empty or consists only of whitespace, it returns the text unchanged.
+    """
+    if not text or not text.strip():
+        return text
+    return text[0].upper() + text[1:]
+
+def is_emoji_unicode(char):
+    # This is a very simplified set of common emoji ranges.
+    # A comprehensive list would be much longer and more complex.
+    # See https://www.unicode.org/Public/emoji/ for full details.
+    if 0x1F600 <= ord(char) <= 0x1F64F:  # Emoticons
+        return True
+    if 0x1F300 <= ord(char) <= 0x1F5FF:  # Miscellaneous Symbols and Pictographs
+        return True
+    if 0x1F680 <= ord(char) <= 0x1F6FF:  # Transport and Map Symbols
+        return True
+    if 0x2600 <= ord(char) <= 0x26FF:  # Miscellaneous Symbols
+        return True
+    if 0x2700 <= ord(char) <= 0x27BF:  # Dingbats
+        return True
+    # Add more ranges as needed for full coverage
+    return False
+
+def _wrap_in_translate(text):
+    """
+    Wraps the given text with <translate> tags.
+    It ensures that empty or whitespace-only strings are not wrapped.
+    The <translate> tags are added around the non-whitespace content,
+    preserving leading and trailing whitespace.
+    """
+    if not text or not text.strip():
+        return text
+    # Find the first and last non-whitespace characters
+    first_char_index = -1
+    last_char_index = -1
+    for i, char in enumerate(text):
+        if char not in (' ', '\n', '\t', '\r', '\f', '\v'):  # Check for common whitespace characters
+            if first_char_index == -1:
+                first_char_index = i
+            last_char_index = i
+
+    # If no non-whitespace characters are found (should be caught by the text.strip() check above, kept for robustness)
+    if first_char_index == -1:
+        return text
+
+    leading_whitespace = text[:first_char_index]
+    content = text[first_char_index : last_char_index + 1]
+    trailing_whitespace = text[last_char_index + 1 :]
+
+    return f"{leading_whitespace}<translate>{content}</translate>{trailing_whitespace}"
+
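+# Illustrative sketch (comments only, assuming the implementation above): how
+# _wrap_in_translate is expected to behave, with whitespace kept outside the tags.
+#   _wrap_in_translate('Hello')      -> '<translate>Hello</translate>'
+#   _wrap_in_translate('  Hello \n') -> '  <translate>Hello</translate> \n'
+#   _wrap_in_translate('   ')        -> '   '  (whitespace-only input is returned unchanged)
+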
+ """ + if not text or not text.strip(): return text - if re.search(r'.*', text): + # Find the first and last non-whitespace characters + first_char_index = -1 + last_char_index = -1 + for i, char in enumerate(text): + if char not in (' ', '\n', '\t', '\r', '\f', '\v'): # Check for common whitespace characters + if first_char_index == -1: + first_char_index = i + last_char_index = i + + # If no non-whitespace characters are found (should be caught by text.strip() check, but for robustness) + if first_char_index == -1: return text - # If the text already has tags (including comments), do not add them - if translate_tag_pattern.search(text): + leading_whitespace = text[:first_char_index] + content = text[first_char_index : last_char_index + 1] + trailing_whitespace = text[last_char_index + 1 :] + + return f"{leading_whitespace}{content}{trailing_whitespace}" + +def process_syntax_highlight(text): + """ + Processes tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('')), "Invalid syntax highlight tag" + # Get inside the tag + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: return text + # Wrap the content in tags + wrapped_content = _wrap_in_translate(content) + return f"{prefix}{wrapped_content}{suffix}" + +def process_table(text): + """ + Processes table blocks in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('{|') and text.endswith('|}')), "Invalid table tag" + return text - # If the text has any special characters, time values, or certain tags, don't wrap it in tags - if (attribute_pattern.search(text) or special_char_pattern.match(text) or - hiero_pattern.search(text) or sub_pattern.search(text) or sup_pattern.search(text) or - time_pattern.match(text) or gallery_pattern.search(text) or file_pattern.search(text) or br_pattern.search(text) or magic_word.search(text)) : # Skip time values +def process_blockquote(text): + """ + Processes blockquote tags in the wikitext. + It wraps the content in tags. + """ + assert(text.startswith('
') and text.endswith('
')), "Invalid blockquote tag" + start_tag_end = text.find('>') + 1 + end_tag_start = text.rfind('<') + if start_tag_end >= end_tag_start: + return text + prefix = text[:start_tag_end] + content = text[start_tag_end:end_tag_start].strip() + suffix = text[end_tag_start:] + if not content: return text - # Wrap the entire block of text in tags - return f'{text}' - -def process_math(line): - """ - Processes math-related tags ({{math}}, , etc.) and ensures their content is not wrapped in tags. - """ - # Generalized regex for math-related tags - math_patterns = [ - re.compile(r'(\{\{math\|.*?\}\})', re.DOTALL), # For {{math}} templates - re.compile(r'(.*?)', re.DOTALL) # For tags with attributes and content - ] - - for pattern in math_patterns: - match = pattern.search(line) - if match: - math_content = match.group(0) - return line.replace(math_content, math_content) # Return math-related content as is - - return line - -def process_table_line(line): - """ - Processes a single line of a table and adds tags where necessary, - ensuring that only the actual content of table cells is wrapped, not the separators. - """ - if not line: - return line - - if line.startswith("|+"): - # For table caption - return f'{line[:2]}{add_translate_tags(line[2:].strip()) if len(line) > 2 else ""}' - elif line.startswith("|-"): - # Table row separator - return line - elif line.startswith("!"): - # For table headers, split on ! and !! without breaking words - headers = header_cell_separator_pattern.split(line) - translated_headers = [] - for header in headers: - if header in ['!', '!!']: # Preserve the ! and !! without adding translate tags - translated_headers.append(header) - else: - # Safely process header content - processed_header = header.strip() - if processed_header: - processed_header = process_external_link(processed_header) - processed_header = process_double_name_space(processed_header) - translated_headers.append(add_translate_tags(processed_header)) - return "".join(translated_headers) - else: - # For table rows, ensure content is wrapped but separators are untouched - cells = table_cell_separator_pattern.split(line) - translated_cells = [] - for cell in cells: - if cell in ['|', '||', '*']: # Leave separators as is - translated_cells.append(cell) - elif cell and cell.startswith("[["): - # Process wiki links using process_double_name_space - processed_cell = process_double_name_space(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("http"): - # Process external links - processed_cell = process_external_link(cell) - translated_cells.append(processed_cell) - elif cell and cell.startswith("{{"): - # Process double curly braces - processed_cell = process_doublecurly(cell) - translated_cells.append(processed_cell) - elif cell: - translated_cells.append(add_translate_tags(cell.strip())) - return "".join(translated_cells) - -def process_div(line): - """ - Processes any
-def process_div(line):
-    """
-    Processes any <div> tag and adds <translate> tags around the text content inside the div,
-    while keeping the div structure and attributes intact.
-    """
-    # Regex pattern to detect <div> tags
-    div_pattern = re.compile(r'(<div[^>]*>)(.*?)(</div>)', re.DOTALL)
-    match = div_pattern.search(line)
-
-    if match:
-        opening_div_tag = match.group(1)  # <div>
-        div_content = match.group(2)  # Text or content inside the div
-        closing_div_tag = match.group(3)  # </div>
-
-        # Wrap only the text content inside the div with <translate> tags
-        translated_content = add_translate_tags(div_content.strip())
-
-        return f'{opening_div_tag}{translated_content}{closing_div_tag}'
-    return line
-
+def process_poem_tag(text):
+    """
+    Processes <poem> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert(text.startswith('<poem') and text.endswith('</poem>')), "Invalid poem tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_code_tag(text, tvar_code_id=0):
+    """
+    Processes <code> tags in the wikitext.
+    It wraps the content in <tvar> tags.
+    """
+    assert(text.startswith('<code') and text.endswith('</code>')), "Invalid code tag"
+    # Get inside the tag
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <tvar> tags
+    wrapped_content = f'<tvar name="code{tvar_code_id}">{content}</tvar>'
+    return f"{prefix}{wrapped_content}{suffix}"
 
-def process_header(line):
-    match = header_pattern.match(line)
-    if match:
-        translated_header_text = add_translate_tags(match.group())
-        return translated_header_text
-    return line
-
+def process_div(text):
+    """
+    Processes <div> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert(text.startswith('<div') and text.endswith('</div>')), "Invalid div tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_hiero(text):
+    """
+    Processes <hiero> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert(text.startswith('<hiero>') and text.endswith('</hiero>')), "Invalid hiero tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_sub_sup(text):
+    """
+    Processes <sub> and <sup> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert((text.startswith('<sub>') and text.endswith('</sub>')) or
+           (text.startswith('<sup>') and text.endswith('</sup>'))), "Invalid sub/sup tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_math(text):
+    """
+    Processes <math> tags in the wikitext.
+    The math markup is currently returned unchanged.
+    """
+    assert(text.startswith('<math>') and text.endswith('</math>')), "Invalid math tag"
+    return text
+
+def process_small_tag(text):
+    """
+    Processes <small> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert(text.startswith('<small>') and text.endswith('</small>')), "Invalid small tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_nowiki(text):
+    """
+    Processes <nowiki> tags in the wikitext.
+    It wraps the content in <translate> tags.
+    """
+    assert(text.startswith('<nowiki>') and text.endswith('</nowiki>')), "Invalid nowiki tag"
+    start_tag_end = text.find('>') + 1
+    end_tag_start = text.rfind('<')
+    if start_tag_end >= end_tag_start:
+        return text
+    prefix = text[:start_tag_end]
+    content = text[start_tag_end:end_tag_start].strip()
+    suffix = text[end_tag_start:]
+    if not content:
+        return text
+    # Wrap the content in <translate> tags
+    wrapped_content = _wrap_in_translate(content)
+    return f"{prefix}{wrapped_content}{suffix}"
+
+def process_item(text):
+    """
+    Processes list items in the wikitext.
+    It wraps the item content in <translate> tags.
+    """
+    offset = 0
+    if text.startswith(';'):
+        offset = 1
+    elif text.startswith(':'):
+        offset = 1
+    elif text.startswith('#'):
+        while text[offset] == '#':
+            offset += 1
+    elif text.startswith('*'):
+        while text[offset] == '*':
+            offset += 1
+    # Add translate tags around the item content
+    item_content = text[offset:].strip()
+    if not item_content:
+        return text
+    return text[:offset] + ' ' + _wrap_in_translate(item_content) + '\n'
 
-def process_double_name_space(line):
-    """
-    Double Name space (e.g., [[link/eg]]) and adds <translate> tags around the eg text.
-    Also handles simple internal links by adding Special:MyLanguage prefix.
-    Properly handles text that appears after closing brackets by adding translate tags.
-    Does not put translate tags around colons.
-    """
-    if 'Special:MyLanguage/' in line:
-        return line
-
-    if line.lower().startswith("[[category:"):
-        y = line.split(']]')[0]
-        if not existing_translation_pattern.search(line):
-            return y + '{{#translation:}}]]'
-        else:
-            return y + ']]'
-
-    # Handle File case
-    if len(line) > 6 and line[0:2] == "[[" and line[2:6] == "File":
-        # File processing logic remains the same
-        returnline = ""
-        i = 0
-        while i < len(line):
-            if line[i:i+4] == 'alt=':
-                returnline += "alt=<translate>"
-                i += 4
-                while i < len(line) and line[i] != '|' and line[i] != ']':
-                    returnline += line[i]
-                    i += 1
-                returnline += "</translate>"
-                if i < len(line):
-                    returnline += line[i]
-            else:
-                if i < len(line) and line[i] == '|':
-                    if i+1 < len(line) and line[i+1] == ' ':
-                        if i+3 < len(line) and line[i+2:i+4] in ('left'):
-                            returnline += line[i]
-                        elif i+6 < len(line) and line[i+2:i+7] in ('right','center','thumb'):
-                            returnline += line[i]
-                        else:
-                            returnline += "| <translate>"
-                            i += 2
-                            while i < len(line) and line[i] != '|' and line[i] != ']':
-                                returnline += line[i]
-                                i += 1
-                            returnline += "</translate>"
-                            if i < len(line):
-                                returnline += line[i]
-                    else:
-                        if i+2 < len(line) and line[i+1:i+3] in ('left'):
-                            returnline += line[i]
-                        elif i+5 < len(line) and line[i+1:i+6] in ('right','center','thumb'):
-                            returnline += line[i]
-                        else:
-                            returnline += "| <translate>"
-                            i += 1
-                            while i < len(line) and line[i] != '|' and line[i] != ']':
-                                returnline += line[i]
-                                i += 1
-                            returnline += "</translate>"
-                            if i < len(line):
-                                returnline += line[i]
-                else:
-                    if i < len(line):
-                        returnline += line[i]
-            i += 1
-        return returnline
-
-    link_pattern = r'\[\[(.*?)\]\]'
-    parts = re.split(link_pattern, line)
-    result = ""
-    for i, part in enumerate(parts):
-        if i % 2 == 0:
-            if part.strip():
-                colon_parts = re.split(r'(:)', part)
-                for j, cp in enumerate(colon_parts):
-                    if cp == ':':
-                        result += cp
-                    elif cp.strip():
-                        result += f"<translate>{cp}</translate>"
-                    else:
-                        result += cp
-            else:
-                result += part
-        else:
-            if '|' in part:
-                link_target, link_text = part.split('|', 1)
-                if not link_target.startswith(('Category:', 'File:', 'Special:')):
-                    result += f'[[Special:MyLanguage/{link_target}|{link_text}]]'
-                else:
-                    result += f'[[{link_target}|{link_text}]]'
-            else:
-                if not part.startswith(('Category:', 'File:', 'Special:')):
-                    result += f'[[Special:MyLanguage/{part}|{part}]]'
-                else:
-                    result += f'[[{part}]]'
-
-    return result
-
+class double_brackets_types(Enum):
+    wikilink = 1
+    category = 2
+    inline_icon = 3
+    not_inline_icon_file = 4
+    special = 5
+    invalid_file = 6
+
+def _process_file(s, tvar_inline_icon_id=0):
+    # Define keywords that should NOT be translated when found as parameters
+    NON_TRANSLATABLE_KEYWORDS = {
+        'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none',
+        'upright', 'baseline', 'middle', 'sub', 'super', 'text-top', 'text-bottom', '{{dirstart}}', '{{dirend}}'
+    }
+    NON_TRANSLATABLE_KEYWORDS_PREFIXES = {
+        'link=', 'upright=', 'alt='
+    }
+    NOT_INLINE_KEYWORDS = {
+        'left', 'right', 'centre', 'center', 'thumb', 'frameless', 'border', 'none', '{{dirstart}}', '{{dirend}}'
+    }
+    file_aliases = ['File:', 'file:', 'Image:', 'image:']
+
+    tokens = []
+
+    inner_content = s[2:-2]  # Remove the leading [[ and trailing ]]
+    tokens = inner_content.split('|')
+    tokens = [token.strip() for token in tokens]  # Clean up whitespace around tokens
+
+    # The first token shall start with a file alias,
+    # e.g., "File:Example.jpg" or "Image:Example.png"
+    if not tokens or not tokens[0].startswith(tuple(file_aliases)):
+        return s, double_brackets_types.invalid_file
+
+    # The first token is a file link
+    filename = tokens[0].split(':', 1)[1] if ':' in tokens[0] else tokens[0]
+    tokens[0] = f'File:{filename}'
+
+    # Substitute 'left' with {{dirstart}}
+    while 'left' in tokens:
+        tokens[tokens.index('left')] = '{{dirstart}}'
+    # Substitute 'right' with {{dirend}}
+    while 'right' in tokens:
+        tokens[tokens.index('right')] = '{{dirend}}'
+
+    ############################
+    # Managing inline icons
+    ############################
+    is_inline_icon = True
+    for token in tokens:
+        if token in NOT_INLINE_KEYWORDS:
+            is_inline_icon = False
+            break
+    if is_inline_icon:
+        # Check if it contains 'alt=' followed by an emoji
+        for token in tokens[1:]:
+            if token.startswith('alt='):
+                alt_text = token[len('alt='):].strip()
+                if not any(is_emoji_unicode(char) for char in alt_text):
+                    is_inline_icon = False
+                    break
+            elif token not in NON_TRANSLATABLE_KEYWORDS:
+                is_inline_icon = False
+                break
+            elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES):
+                is_inline_icon = False
+                break
+
+    if is_inline_icon:
+        # Return something like: [[File:smiley.png|alt=🙂]]
+        returnline = '[[' + '|'.join(tokens) + ']]'
+        return returnline, double_brackets_types.inline_icon
+
+    ############################
+    # Managing general files
+    ############################
+
+    output_parts = []
+
+    # The first token is the file name (e.g., "File:Example.jpg");
+    # any "Image:" prefix has already been normalised to "File:" above
+    output_parts.append(tokens[0])
+
+    pixel_regex = re.compile(r'\d+(?:x\d+)?px')  # Matches pixel values like "100px" or "100x50px"
+    for token in tokens[1:]:
+        # Check for 'alt='
+        if token.startswith('alt='):
+            alt_text = token[len('alt='):].strip()
+            output_parts.append('alt=' + _wrap_in_translate(alt_text))
+        # Check if the token is a known non-translatable keyword
+        elif token in NON_TRANSLATABLE_KEYWORDS:
+            output_parts.append(token)
+        # If the token starts with a known non-translatable prefix, keep it as is
+        elif any(token.startswith(prefix) for prefix in NON_TRANSLATABLE_KEYWORDS_PREFIXES):
+            output_parts.append(token)
+        # If the token is a pixel value, keep it as is
+        elif pixel_regex.match(token):
+            output_parts.append(token)
+        # Otherwise, assume it's a caption or other translatable text
+        else:
+            output_parts.append(f"<translate>{token}</translate>")
+
+    # Reconstruct the line with the transformed parts
+    returnline = '[[' + '|'.join(output_parts) + ']]'
+    return returnline, double_brackets_types.not_inline_icon_file
+
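+# Illustrative examples for _process_file (assuming emoji-only alt text marks an
+# inline icon, and that 'left'/'right' are normalised to {{dirstart}}/{{dirend}}):
+#   [[File:smiley.png|alt=🙂]]
+#       -> ('[[File:smiley.png|alt=🙂]]', double_brackets_types.inline_icon)
+#   [[File:Map.jpg|thumb|left|A map]]
+#       -> ('[[File:Map.jpg|thumb|{{dirstart}}|<translate>A map</translate>]]',
+#           double_brackets_types.not_inline_icon_file)
+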
+ """ + if not (text.startswith("[[") and text.endswith("]]")) : + print(f"Input >{text}< must be wrapped in double brackets [[ ]]") + sys.exit(1) + # Split the link into parts, handling both internal links and links with display text - result = "" - for i, part in enumerate(parts): - if i % 2 == 0: - if part.strip(): - colon_parts = re.split(r'(:)', part) - for j, cp in enumerate(colon_parts): - if cp == ':': - result += cp - elif cp.strip(): - result += f"{cp}" - else: - result += cp - else: - result += part - else: - if '|' in part: - link_target, link_text = part.split('|', 1) - if not link_target.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{link_target}|{link_text}]]' - else: - result += f'[[{link_target}|{link_text}]]' - else: - if not part.startswith(('Category:', 'File:', 'Special:')): - result += f'[[Special:MyLanguage/{part}|{part}]]' - else: - result += f'[[{part}]]' + inner_wl = text[2:-2] # Remove the leading [[ and trailing ]] + parts = inner_wl.split('|') + + # part 0 + category_aliases = ['Category:', 'category:', 'Cat:', 'cat:'] + file_aliases = ['File:', 'file:', 'Image:', 'image:'] - return result -def process_external_link(line): + parts[0] = parts[0].strip() # Clean up the first part + # Check if the first part is a category or file alias + if parts[0].startswith(tuple(category_aliases)): + # Handle category links + cat_name = parts[0].split(':', 1)[1] if ':' in parts[0] else parts[0] + return f'[[Category:{cat_name}{{{{#translation:}}}}]]', double_brackets_types.category + elif parts[0].startswith(tuple(file_aliases)): + # Handle file links + return _process_file(text) + elif parts[0].startswith('Special:'): + # Handle special pages + return f'[[{parts[0]}]]', double_brackets_types.special + + # Assuming it's a regular internal link + if len(parts) == 1: + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[0]}]]', double_brackets_types.wikilink + if len(parts) == 2 : + return f'[[Special:MyLanguage/{capitalise_first_letter(parts[0])}|{parts[1]}]]', double_brackets_types.wikilink + return text + +def process_external_link(text, tvar_url_id=0): """ Processes external links in the format [http://example.com Description] and ensures that only the description part is wrapped in tags, leaving the URL untouched. """ - match = re.match(r'(\[https?://[^\s]+)\s+([^\]]+)\]', line) + match = re.match(r'\[(https?://[^\s]+)\s+([^\]]+)\]', text) if match: url_part = match.group(1) description_part = match.group(2) # Wrap only the description part in tags, leave the URL untouched - return f'{url_part} {description_part}]' - return line + return f'[{url_part} {description_part}]' + return text -def process_lists(line): - """ - Processes lists (e.g., *, #, :) by adding tags around list item content. - """ - for i in range(len(line)): - if line[i] in ['*', '#', ':', ';']: - continue - else: - words = line[i:].split("
") - for j in range(len(words)): - if "https://" in words[j]: - words[j] = f"{words[j]}" - elif "[[" in words[j]: - words[j] = process_double_name_space(words[j]) - else: - worder = words[j].split(":") - for k in range(len(worder)): - if worder[k] == '': - continue - else: - worder[k] = f"{worder[k]}" - words[j] = ":".join(worder) - - newstring = "
".join(words) - return f"{line[:i]}{newstring}" - - -def process_doublecurly(line): +def process_template(text): """ Processes the text to ensure that only the content outside of double curly braces {{ ... }} is wrapped in tags, while preserving the template content inside the braces without translating it. """ - if "{{" in line and "}}" in line: - start = line.index("{{") - end = line.index("}}") + 2 # Include the closing "}}" - inside_curly = line[start:end] - outside_curly = line[end:].strip() - if outside_curly: - return f"{line[:start]}{outside_curly}" - return inside_curly - else: - return f"{line.strip()}" - -def process_blockquote(line): - """ - Handles blockquote tags by ensuring content inside blockquote is not wrapped in tags. - """ - if "
" in line and "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].split("
")[0] - after_blockquote = line.split("
")[1].strip() - - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - - return f'{translated_before}
{blockquote_content}
{translated_after}' - elif "
" in line: - before_blockquote = line.split("
")[0].strip() - blockquote_content = line.split("
")[1].strip() - translated_before = add_translate_tags(before_blockquote) if before_blockquote else '' - return f'{translated_before}
{blockquote_content}' - elif "
" in line: - blockquote_content = line.split("
")[0].strip() - after_blockquote = line.split("
")[1].strip() - translated_after = add_translate_tags(after_blockquote) if after_blockquote else '' - return f'{blockquote_content}
{translated_after}' - else: - return line -def process_poem_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + assert(text.startswith('{{') and text.endswith('}}')), "Invalid template tag" + # Split the template content from the rest of the text + inner_content = text[2:-2].strip() # Remove the leading {{ and trailing }} + inner_content = capitalise_first_letter(inner_content) # Capitalise the first letter of the inner content - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. - :return: Processed line, and updated in_poem_block flag. - """ - opening_poem_pattern = re.compile(r'(]*>)', re.IGNORECASE) - closing_poem_pattern = re.compile(r'()', re.IGNORECASE) - # Case 1: Detect an opening tag (without necessarily having a closing tag) - if opening_poem_pattern.search(line) and not in_poem_block: - opening_tag = opening_poem_pattern.search(line).group(1) # Extract the opening tag - start_idx = line.find(opening_tag) + len(opening_tag) - poem_content = line[start_idx:].strip() # Get the content after the opening tag - # If there's a closing tag within the same line - if closing_poem_pattern.search(line): - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - end_idx = line.find(closing_tag) - - poem_content = line[start_idx:end_idx].strip() # Get content between and - - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - - # Return the fully processed line - return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len(closing_tag):])}', False - - else: - # If only the opening tag is present, we are in the middle of a poem block - translated_poem_content = convert_to_translatable_wikitext(poem_content) - return f'{opening_tag}{translated_poem_content}', True - - # Case 2: Detect a closing tag without an opening tag in the same line - elif closing_poem_pattern.search(line) and not in_poem_block: - - closing_tag = closing_poem_pattern.search(line).group(1) # Extract the closing tag - poem_content = line[:line.find(closing_tag)].strip() # Get content before the closing tag - print(line.find(closing_tag)) - after_poem= line[line.find(closing_tag)+len(closing_tag):].strip() - # Process the poem content by adding tags - translated_poem_content = convert_to_translatable_wikitext(poem_content) - # print(after_poem) - translated_after_poem = convert_to_translatable_wikitext(after_poem) - # Return the processed line with the closing tag - return f'{translated_poem_content}{closing_tag}{after_poem}', False - - # Case 3: We are inside a block and no closing tag is found in this line - elif in_poem_block: - print(line) - translated_poem_content = convert_to_translatable_wikitext(line.strip()) - return f'{translated_poem_content}', True - - # Case 4: No tag found, return the line as is - return line, in_poem_block -def process_small_tag(line, in_poem_block=False): - """ - Detects and tags and processes the text content inside the poem - by wrapping it in tags. Handles cases where only one of the tags is present. + # If the inner content is empty, return an empty string + if not inner_content : + return text - :param line: The line of text to process. - :param in_poem_block: A flag to indicate if we are already inside a block. 
-def process_small_tag(line, in_poem_block=False):
-    """
-    Detects <small> and </small> tags and processes the text content inside the poem
-    by wrapping it in <translate> tags. Handles cases where only one of the tags is present.
-
-    :param line: The line of text to process.
-    :param in_poem_block: A flag to indicate if we are already inside a <small> block.
-    :return: Processed line, and updated in_poem_block flag.
-    """
-    opening_poem_pattern = re.compile(r'(<small[^>]*>)', re.IGNORECASE)
-    closing_poem_pattern = re.compile(r'(</small>)', re.IGNORECASE)
-    # Case 1: Detect an opening tag (without necessarily having a closing tag)
-    if opening_poem_pattern.search(line) and not in_poem_block:
-        opening_tag = opening_poem_pattern.search(line).group(1)  # Extract the opening tag
-        start_idx = line.find(opening_tag) + len(opening_tag)
-        poem_content = line[start_idx:].strip()  # Get the content after the opening tag
-
-        # If there's a closing tag within the same line
-        if closing_poem_pattern.search(line):
-            closing_tag = closing_poem_pattern.search(line).group(1)  # Extract the closing tag
-            end_idx = line.find(closing_tag)
-            poem_content = line[start_idx:end_idx].strip()  # Get content between <small> and </small>
-
-            # Process the content by adding <translate> tags
-            translated_poem_content = convert_to_translatable_wikitext(poem_content)
-
-            # Return the fully processed line
-            return f'{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len("</small>"):])}', False
-
-        else:
-            # If only the opening tag is present, we are in the middle of a poem block
-            translated_poem_content = convert_to_translatable_wikitext(poem_content)
-            return f'{opening_tag}{translated_poem_content}', True
-
-    # Case 2: Detect a closing tag without an opening tag in the same line
-    elif closing_poem_pattern.search(line) and not in_poem_block:
-
-        closing_tag = closing_poem_pattern.search(line).group(1)  # Extract the closing tag
-        poem_content = line[:line.find(closing_tag)].strip()  # Get content before the closing tag
-        after_poem = line[line.find(closing_tag)+len(closing_tag):].strip()
-        # Process the content by adding <translate> tags
-        translated_poem_content = convert_to_translatable_wikitext(poem_content)
-        # print(after_poem)
-        translated_after_poem = convert_to_translatable_wikitext(after_poem)
-        # Return the processed line with the closing tag
-        return f'{translated_poem_content}{closing_tag}{after_poem}', False
-
-    # Case 3: We are inside a block and no closing tag is found in this line
-    elif in_poem_block:
-        translated_poem_content = convert_to_translatable_wikitext(line.strip())
-        return f'{translated_poem_content}', True
-
-    # Case 4: No tag found, return the line as is
-    return line, in_poem_block
-
-# def process_big_tag(line, in_poem_block=False):
-#     """
-#     Detects <big> and </big> tags and processes the text content inside the tags
-#     by wrapping it in <translate> tags. Handles cases where only one of the tags is present.
-
-#     :param line: The line of text to process.
-#     :param in_poem_block: A flag to indicate if we are already inside a <big> block.
-#     :return: Processed line, and updated in_poem_block flag.
-#     """
-#     opening_poem_pattern = re.compile(r'(<big[^>]*>)', re.IGNORECASE)
-#     closing_poem_pattern = re.compile(r'(</big>)', re.IGNORECASE)
-#     # Case 1: Detect an opening tag (without necessarily having a closing tag)
-#     if opening_poem_pattern.search(line) and not in_poem_block:
-#         opening_tag = opening_poem_pattern.search(line).group(1)  # Extract the opening tag
-#         start_idx = line.find(opening_tag) + len(opening_tag)
-#         print(line[:start_idx])
-#         poem_content = line[start_idx:].strip()  # Get the content after the opening tag
-#         # If there's a closing tag within the same line
-#         if closing_poem_pattern.search(line):
-#             closing_tag = closing_poem_pattern.search(line).group(1)  # Extract the closing tag
-#             end_idx = line.find(closing_tag)
-#             poem_content = line[start_idx:end_idx].strip()  # Get content between <big> and </big>

-#             # Process the content by adding <translate> tags
-#             translated_poem_content = convert_to_translatable_wikitext(poem_content)
-#             # Return the fully processed line
-#             return f'{convert_to_translatable_wikitext(line[:start_idx])}{opening_tag}{translated_poem_content}{closing_tag}{convert_to_translatable_wikitext(line[end_idx+len("</big>"):])}', False

-#         else:
-#             # If only the opening tag is present, we are in the middle of a poem block
-#             translated_poem_content = convert_to_translatable_wikitext(poem_content)
-#             return f'{opening_tag}{translated_poem_content}', True

-#     # Case 2: Detect a closing tag without an opening tag in the same line
-#     elif closing_poem_pattern.search(line) and not in_poem_block:

-#         closing_tag = closing_poem_pattern.search(line).group(1)  # Extract the closing tag
-#         poem_content = line[:line.find(closing_tag)].strip()  # Get content before the closing tag
-#         print(line.find(closing_tag))
-#         after_poem = line[line.find(closing_tag)+len(closing_tag):].strip()
-#         # Process the content by adding <translate> tags
-#         translated_poem_content = convert_to_translatable_wikitext(poem_content)
-#         # print(after_poem)
-#         translated_after_poem = convert_to_translatable_wikitext(after_poem)
-#         # Return the processed line with the closing tag
-#         return f'{translated_poem_content}{closing_tag}{after_poem}', False

-#     # Case 3: We are inside a block and no closing tag is found in this line
-#     elif in_poem_block:
-#         translated_poem_content = convert_to_translatable_wikitext(line.strip())
-#         return f'{translated_poem_content}', True

-#     # Case 4: No tag found, return the line as is
-#     return line, in_poem_block
-
-def process_code_tag(line):
-    """
-    Processes <code> and </code> tags and ensures that the content inside the tags is not wrapped in <translate> tags.
-    """
-    if "<code>" in line and "</code>" in line:
-        before_code = line.split("<code>")[0].strip()
-        code_content = line.split("<code>")[1].split("</code>")[0]
-        after_code = line.split("</code>")[1].strip()
-
-        translated_before = convert_to_translatable_wikitext(before_code) if before_code else ''
-        translated_after = convert_to_translatable_wikitext(after_code) if after_code else ''
-
-        return f'{translated_before}<code>{code_content}</code>{translated_after}'
-    elif "<code>" in line:
-        before_code = line.split("<code>")[0].strip()
-        code_content = line.split("<code>")[1].strip()
-        translated_before = convert_to_translatable_wikitext(before_code) if before_code else ''
-        return f'{translated_before}<code>{code_content}'
-    elif "</code>" in line:
-        code_content = line.split("</code>")[0].strip()
-        after_code = line.split("</code>")[1].strip()
-        translated_after = convert_to_translatable_wikitext(after_code) if after_code else ''
-        return f'{code_content}</code>{translated_after}'
-    else:
-        return line
-
-def process_syntax_highlights(line):
-    """
-    Processes <syntaxhighlight> and </syntaxhighlight> tags and ensures that the content inside the tags is not wrapped in <translate> tags.
-    """
-    if "<syntaxhighlight" in line and "</syntaxhighlight>" in line:
-        before_syntax = line.split("<syntaxhighlight")[0].strip()
-        syntax_content = line.split("<syntaxhighlight")[1].split("</syntaxhighlight>")[0]
-        after_syntax = line.split("</syntaxhighlight>")[1].strip()
-
-        translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else ''
-        return f'{translated_before}<syntaxhighlight{syntax_content}</syntaxhighlight>{after_syntax}'
-    elif "<syntaxhighlight" in line:
-        before_syntax = line.split("<syntaxhighlight")[0].strip()
-        syntax_content = line.split("<syntaxhighlight")[1].strip()
-        translated_before = convert_to_translatable_wikitext(before_syntax) if before_syntax else ''
-        return f'{translated_before}<syntaxhighlight{syntax_content}'
-    elif "</syntaxhighlight>" in line:
-        syntax_content = line.split("</syntaxhighlight>")[0].strip()
-        after_syntax = line.split("</syntaxhighlight>")[1].strip()
-        translated_after = convert_to_translatable_wikitext(after_syntax) if after_syntax else ''
-        return f'{syntax_content}</syntaxhighlight>{translated_after}'
-    else:
-        return line
-
+def process_raw_url(text):
+    """
+    Processes raw URLs in the wikitext.
+    The URL is returned stripped of surrounding whitespace and is not translated.
+    """
+    # This function assumes the text is a raw URL, e.g., "http://example.com"
+    if not text.strip():
+        return text
+    return text.strip()
+
+
+# --- Main Tokenisation Logic ---
+
 def convert_to_translatable_wikitext(wikitext):
-    if wikitext == "":
-        return ""
     """
-    Converts standard wikitext to translatable wikitext by wrapping text with <translate> tags.
-    Handles tables, lists, blockquotes, divs, and ensures tags inside blockquotes are not wrapped.
+    Converts standard wikitext to translatable wikitext by wrapping
+    translatable text with <translate> tags, while preserving and
+    correctly handling special wikitext elements.
+    This function tokenizes the entire text, not line by line.
     """
-    lines = re.split("\n", wikitext)
-    converted_lines = []
-    in_syntax_highlight = False
-    in_table = False
-    for line in lines:
-        if line is not None:
-            line = line.strip()
+    if not wikitext:
+        return ""
 
-            if line:
-                if "</syntaxhighlight>" in line:
-                    # End of a syntax highlight block
-                    closing_tag_idx = line.index("</syntaxhighlight>")
-
-                    # Process content before the closing tag
-                    converted_lines.append(line[:closing_tag_idx])
+    # add an extra newline at the beginning, useful to process items at the beginning of the text
+    wikitext = '\n' + wikitext
+
+    parts = []
+    last = 0
+    curr = 0
+    text_length = len(wikitext)
+
+    while curr < text_length:
+        found = None
+        # Syntax highlight block
+        pattern = '<syntaxhighlight'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</syntaxhighlight>', curr) + len('</syntaxhighlight>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_syntax_highlight))
+            curr = end_pattern
+            last = curr
+            continue
+        # Table block
+        pattern = '{|'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('|}', curr) + len('|}')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_table))
+            curr = end_pattern
+            last = curr
+            continue
+        # Blockquote
+        pattern = '<blockquote'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</blockquote>', curr) + len('</blockquote>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_blockquote))
+            curr = end_pattern
+            last = curr
+            continue
+        # Poem tag
+        pattern = '<poem'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</poem>', curr) + len('</poem>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_poem_tag))
+            curr = end_pattern
+            last = curr
+            continue
+        # Code tag
+        pattern = '<code'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</code>', curr) + len('</code>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_code_tag))
+            curr = end_pattern
+            last = curr
+            continue
+        # Div tag
+        pattern = '<div'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</div>', curr) + len('</div>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_div))
+            curr = end_pattern
+            last = curr
+            continue
+        # Hiero tag
+        pattern = '<hiero>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</hiero>', curr) + len('</hiero>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_hiero))
+            curr = end_pattern
+            last = curr
+            continue
+        # Sub tag
+        pattern = '<sub>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</sub>', curr) + len('</sub>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_sub_sup))
+            curr = end_pattern
+            last = curr
+            continue
+        # Sup tag
+        pattern = '<sup>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</sup>', curr) + len('</sup>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_sub_sup))
+            curr = end_pattern
+            last = curr
+            continue
+        # Math tag
+        pattern = '<math>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</math>', curr) + len('</math>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_math))
+            curr = end_pattern
+            last = curr
+            continue
+        # Small tag
+        pattern = '<small>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</small>', curr) + len('</small>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_small_tag))
+            curr = end_pattern
+            last = curr
+            continue
+        # Nowiki tag
+        pattern = '<nowiki>'
+        if wikitext.startswith(pattern, curr):
+            end_pattern = wikitext.find('</nowiki>', curr) + len('</nowiki>')
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pattern], process_nowiki))
+            curr = end_pattern
+            last = curr
+            continue
+        # br tag
+        patterns = ['<br>', '<br/>', '<br />']
+        for p in patterns:
+            if wikitext.startswith(p, curr):
+                end_pattern = curr + len(p)
+                if last < curr:
+                    parts.append((wikitext[last:curr], _wrap_in_translate))
+                parts.append((wikitext[curr:end_pattern], lambda x: x))
+                curr = end_pattern
+                last = curr
+                found = True
+                break
+        if found:
+            continue
+        # Lists
+        patterns_newline = ['\n*', '\n#', '\n:', '\n;']
+        if any(wikitext.startswith(p, curr) for p in patterns_newline):
+            curr += 1  # Discard the newline character
+            parts.append((wikitext[last:curr], _wrap_in_translate))
+            # Iterate through the list items
+            patterns = ['*', '#', ':', ';']
+            while any(wikitext.startswith(p, curr) for p in patterns):
+                end_pattern = wikitext.find('\n', curr)
+                if end_pattern == -1:
+                    end_pattern = text_length
+                else:
+                    end_pattern += 1  # Include the newline in the part
+                parts.append((wikitext[curr:end_pattern], process_item))
+                curr = end_pattern
+                last = curr
+            continue
+        # Internal links
+        pattern = '[['
+        if wikitext.startswith(pattern, curr):
+            # Count the number of opening double brackets '[[' and closing ']]' to find the end
+            end_pos = curr + 2
+            bracket_count = 1
+            while end_pos < text_length and bracket_count > 0:
+                if wikitext.startswith('[[', end_pos):
+                    bracket_count += 1
+                    end_pos += 2
+                elif wikitext.startswith(']]', end_pos):
+                    bracket_count -= 1
+                    end_pos += 2
+                else:
+                    end_pos += 1
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            if end_pos > curr + 2:  # Ensure we have a valid link
+                parts.append((wikitext[curr:end_pos], process_double_brackets))
+            curr = end_pos
+            last = curr
+            continue
+        # External links
+        pattern = '[http'
+        if wikitext.startswith(pattern, curr):
+            # Find the end of the external link
+            end_pos = wikitext.find(']', curr)
+            if end_pos == -1:
+                end_pos = text_length
+            else:
+                end_pos += 1  # Include the closing ']' in the part
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pos], process_external_link))
+            curr = end_pos
+            last = curr
+            continue
+        # Templates
+        pattern = '{{'
+        if wikitext.startswith(pattern, curr):
+            # Find the end of the template
+            end_pos = wikitext.find('}}', curr) + 2
+            if end_pos == 1:
+                end_pos = text_length
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pos], process_template))
+            curr = end_pos
+            last = curr
+            continue
+        # Raw URLs
+        pattern = 'http'
+        if wikitext.startswith(pattern, curr):
+            # Find the end of the URL (space or end of string)
+            end_pos = wikitext.find(' ', curr)
+            if end_pos == -1:
+                end_pos = text_length
+            if last < curr:
+                parts.append((wikitext[last:curr], _wrap_in_translate))
+            parts.append((wikitext[curr:end_pos], process_raw_url))
+            curr = end_pos
+            last = curr
+            continue
+        # Behaviour switches
+        for switch in behaviour_switches:
+            if wikitext.startswith(switch, curr):
+                end_pos = curr + len(switch)
+                if last < curr:
+                    parts.append((wikitext[last:curr], _wrap_in_translate))
+                parts.append((wikitext[curr:end_pos], lambda x: x))
+                curr = end_pos
+                last = curr
+                found = True
+                break
+        if found:
+            continue
 
-                    # Append the closing syntaxhighlight tag
-                    converted_lines.append(line[closing_tag_idx:])
-                    in_syntax_highlight = False  # Exiting syntax highlight mode
-                elif in_syntax_highlight:
-                    # Inside a syntaxhighlight block, do not process the line
-                    converted_lines.append(line)
-                elif line.startswith("'''"):
-                    converted_lines.append(process_lists(line))
-                elif line.startswith("{|"):
-                    in_table = True
-                    converted_lines.append(line)
-                elif line.startswith("|}") and in_table:
-                    in_table = False
-                    converted_lines.append(line)
-                elif in_table:
-                    converted_lines.append(process_table_line(line))
-                elif header_pattern.match(line):
-                    converted_lines.append(process_header(line))
-                elif line.startswith("http"):
-                    converted_lines.append(line)
-                elif line.startswith("[["):
-                    converted_lines.append(process_double_name_space(line))
-                elif line.startswith("["):
-                    converted_lines.append(process_external_link(line))
-                elif line.startswith("<gallery"):
-                    converted_lines.append(line)
-                elif line.startswith("*") or line.startswith("#") or line.startswith(":") or line.startswith(";"):
-                    converted_lines.append(process_lists(line))
-                elif line.startswith("{{"):
-                    converted_lines.append(process_doublecurly(line))
-                elif "<blockquote>" in line or "</blockquote>" in line:
-                    converted_lines.append(process_blockquote(line))
-                elif "<poem>" in line:
-                    converted_lines.append(process_poem_tag(line)[0])
-                elif "<code>" in line:
-                    converted_lines.append(process_code_tag(line))
-                elif '<syntaxhighlight' in line:
-                    converted_lines.append(process_syntax_highlights(line))  # Do not add translate tags inside <syntaxhighlight> tag
-                elif '<hiero>' in line:
-                    converted_lines.append(line)
-                elif "<sub>" in line or "<sup>" in line:
-                    converted_lines.append(line)  # Do not add translate tags inside <sub>/<sup> tag
-                elif sub_pattern.search(line) or sup_pattern.search(line):
-                    converted_lines.append(line)  # Do not add translate tags inside <sub>/<sup>
-                elif "<math>" in line or "{{math}}" in line:
-                    converted_lines.append(process_math(line))  # Handle math tags
-                elif "<small>" in line or "</small>" in line:
-                    # If the line contains <small> tags, we won't wrap them.
-                    converted_lines.append(process_small_tag(line)[0])
-                else:
-                    converted_lines.append(add_translate_tags(line))
-            else:
-                converted_lines.append('')
-    converted_lines = [str(line) if line is not None else "" for line in converted_lines]
-    return '\n'.join(converted_lines)
+
+        curr += 1  # Move to the next character if no pattern matched
+
+    # Add any remaining text after the last processed part
+    if last < text_length:
+        parts.append((wikitext[last:], _wrap_in_translate))
+
+    """
+    print('*' * 20)
+    for i, (part, handler) in enumerate(parts):
+        print(f"--- Start element {i} with handler {handler.__name__} ---")
+        print(part)
+        print(f"---\n")
+
+    print('*' * 20)
+    """
+
+    # Process links
+    tvar_id = 0
+    tvar_url_id = 0
+    tvar_code_id = 0
+    for i, (part, handler) in enumerate(parts):
+        # Handlers for links require a tvar id
+        if handler == process_double_brackets:
+            new_part, double_brackets_type = handler(part, tvar_id)
+            if double_brackets_type in [double_brackets_types.wikilink, double_brackets_types.special, double_brackets_types.inline_icon]:
+                new_handler = _wrap_in_translate  # Change handler to _wrap_in_translate
+            else:
+                new_handler = lambda x: x  # No further processing for categories and files
+            parts[i] = (new_part, new_handler)
+            tvar_id += 1
+        elif handler == process_external_link:
+            new_part = handler(part, tvar_url_id)
+            new_handler = _wrap_in_translate  # Change handler to _wrap_in_translate
+            parts[i] = (new_part, new_handler)
+            tvar_url_id += 1
+        elif handler == process_code_tag:
+            new_part = handler(part, tvar_code_id)
+            new_handler = _wrap_in_translate  # Change handler to _wrap_in_translate
+            parts[i] = (new_part, new_handler)
+            tvar_code_id += 1
+
+    # Scan again the parts: merge consecutive parts handled by _wrap_in_translate
+    _parts = []
+    if parts:
+        current_part, current_handler = parts[0]
+        for part, handler in parts[1:]:
+            if handler == _wrap_in_translate and current_handler == _wrap_in_translate:
+                # Merge the parts
+                current_part += part
+            else:
+                _parts.append((current_part, current_handler))
+                current_part, current_handler = part, handler
+        # Add the last accumulated part
+        _parts.append((current_part, current_handler))
+
+    # Process the parts with their respective handlers
+    processed_parts = [handler(part) for part, handler in _parts]
+
+    # Debug output
+    """
+    print("Processed parts:")
+    for i, (ppart, (part, handler)) in enumerate(zip(processed_parts, _parts)):
+        print(f"--- Start element {i} with handler {handler.__name__} ---")
+        print(part)
+        print(f"---\n")
+        print(ppart)
+        print(f"---\n")
+    """
+
+    # Join the processed parts into a single string
+    return ''.join(processed_parts)[1:]  # Remove the leading newline added at the beginning
 
 @app.route('/')
 def index():
diff --git a/tests.py b/tests.py
index 5cdde08..2112349 100644
--- a/tests.py
+++ b/tests.py
@@ -1,8 +1,8 @@
 import unittest
-from app import convert_to_translatable_wikitext, process_double_name_space
+from app import convert_to_translatable_wikitext, process_double_brackets
 
 class TestTranslatableWikitext(unittest.TestCase):
-    
+
     def test_section_headers(self):
         self.assertEqual(
             convert_to_translatable_wikitext("==HELLO=="),
@@ -12,67 +12,181 @@ def test_section_headers(self):
     def test_file_tag_translations(self):
         self.assertEqual(
             convert_to_translatable_wikitext(
-                "[[File:landscape.jpg |thumb |left |alt=sunset |Photo of a beautiful landscape]]"
+                '[[File:landscape.jpg |thumb |left | alt=sunset |Photo of a beautiful landscape]]'
             ),
-            "[[File:landscape.jpg |thumb |left | alt=<translate>sunset</translate> |<translate>Photo of a beautiful landscape</translate>]]"
+            '[[File:landscape.jpg|thumb|{{dirstart}}|alt=<translate>sunset</translate>|<translate>Photo of a beautiful landscape</translate>]]'
         )
 
     def test_internal_and_external_links(self):
         self.assertEqual(
             convert_to_translatable_wikitext(
-                "This is a text with an [[internal link]] and an [https://openstreetmap.org external link]."
+                'This is a text with an [[internal link]] and an [https://openstreetmap.org external link].'
             ),
-            "<translate>This is a text with an [[internal link]] and an [https://openstreetmap.org external link].</translate>"
+            '<translate>This is a text with an [[Special:MyLanguage/Internal link|internal link]] and an [<tvar name="url0">https://openstreetmap.org</tvar> external link].</translate>'
         )
-    
+
     def test_category_with_translation(self):
         self.assertEqual(
             convert_to_translatable_wikitext("[[Category:Wikipedia]]"),
             "[[Category:Wikipedia{{#translation:}}]]"
         )
-    
+
     def test_notoc_preserved(self):
         self.assertEqual(
             convert_to_translatable_wikitext("__NOTOC__"),
             "__NOTOC__"
         )
-    
+
     def test_simple_internal_link(self):
         self.assertEqual(
-            convert_to_translatable_wikitext("[[link]]"),
-            "[[Special:MyLanguage/link|link]]"
+            convert_to_translatable_wikitext('[[link]]'),
+            '<translate>[[Special:MyLanguage/Link|link]]</translate>'
         )
-    
+
     def test_multiline_text(self):
         self.assertEqual(
-            convert_to_translatable_wikitext("""
-            hi iam charan
-            <br/>
-            happy
-            """),
-            "\n<translate>hi iam charan</translate>\n<br/>\n<translate>happy</translate>\n"
+            convert_to_translatable_wikitext('\nhi iam charan\n<br/>\nhappy\n\n'),
+            '\n<translate>hi iam charan</translate>\n<br/>\n<translate>happy</translate>\n\n'
         )
-    
+
     def test_double_namespace_processing(self):
         self.assertEqual(
-            process_double_name_space(
-                "[[File:pretty hello word.png|alt=Hello everybody!]], [[File:smiley.png|alt=😂]] How are you?"
+            convert_to_translatable_wikitext(
+                '[[File:pretty hello word.png | alt=Hello everybody!]] [[File:smiley.png|alt=🙂]] How are you?'
             ),
-            "[[File:pretty hello word.png| alt=<translate>Hello everybody!</translate>]], [[File:smiley.png| alt=<translate>😂</translate>]] How are you?"
+            '[[File:pretty hello word.png|alt=<translate>Hello everybody!</translate>]] <translate>[[File:smiley.png|alt=🙂]] How are you?</translate>'
         )
+
     def test_double_namespace_without_list_case_1(self):
         self.assertEqual(
-            process_double_name_space(
-                "[[Help]]ing"
+            convert_to_translatable_wikitext(
+                '[[Help]]ing'
             ),
-            "[[Special:MyLanguage/Help|Help]]<translate>ing</translate>"
+            '<translate>[[Special:MyLanguage/Help|Help]]ing</translate>'
         )
+
     def test_double_namespace_without_list_case_2(self):
         self.assertEqual(
-            process_double_name_space(
-                "[[Help]] ing"
+            convert_to_translatable_wikitext(
+                '[[Help]] ing'
             ),
-            "[[Special:MyLanguage/Help|Help]] <translate>ing</translate>"
+            '<translate>[[Special:MyLanguage/Help|Help]] ing</translate>'
         )
+
+    def test_template_simple(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("{{Template Name}}"),
+            "{{Template Name}}"
+        )
+
+    def test_template_with_parameters(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("{{Template|param1=Value 1|Value 2}}"),
+            "{{Template|param1=Value 1|Value 2}}"
+        )
+
+    def test_template_nested_in_text(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext('Some text with {{a template here}} and more text.'),
+            '<translate>Some text with</translate> {{A template here}} <translate>and more text.</translate>'
+        )
+
+    def test_nowiki_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("Some text with <nowiki>[[Raw link]]</nowiki> content."),
+            "<translate>Some text with</translate> <nowiki><translate>[[Raw link]]</translate></nowiki> <translate>content.</translate>"
+        )
+
+    def test_blockquote_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<blockquote>This is a quote.</blockquote>"),
+            "<blockquote><translate>This is a quote.</translate></blockquote>"
+        )
+
+    def test_poem_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<poem>Line 1\nLine 2</poem>"),
+            "<poem><translate>Line 1\nLine 2</translate></poem>"
+        )
+
+    def test_code_tag_with_tvar(self):
+        # Assuming process_code_tag assigns tvar names sequentially starting from 0
+        self.assertEqual(
+            convert_to_translatable_wikitext("Here is some <code>code</code> for you."),
+            '<translate>Here is some <code><tvar name="code0">code</tvar></code> for you.</translate>'
+        )
+
+    def test_div_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<div>Div content here.</div>"),
+            "<div><translate>Div content here.</translate></div>"
+        )
+
+    def test_hiero_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<hiero>hieroglyphics</hiero>"),
+            "<hiero><translate>hieroglyphics</translate></hiero>"
+        )
+
+    def test_sub_sup_tags(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("H<sub>2</sub>O and E=mc<sup>2</sup>"),
+            "<translate>H</translate><sub><translate>2</translate></sub><translate>O and E=mc</translate><sup><translate>2</translate></sup>"
+        )
+
+    def test_math_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<math>x^2 + y^2 = z^2</math>"),
+            "<math>x^2 + y^2 = z^2</math>"
+        )
+
+    def test_small_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("<small>Small text</small>"),
+            "<small><translate>Small text</translate></small>"
+        )
+
+    def test_image_with_upright(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("[[File:Example.jpg|upright=1.5|A larger image]]"),
+            "[[File:Example.jpg|upright=1.5|<translate>A larger image</translate>]]"
+        )
+
+    def test_multiple_elements_in_one_line(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("Hello world! [[Link]] {{Template}} [https://meta.wikimedia.org/wiki/Main_Page Home]"),
+            '<translate>Hello world! [[Special:MyLanguage/Link|Link]]</translate> {{Template}} <translate>[<tvar name="url0">https://meta.wikimedia.org/wiki/Main_Page</tvar> Home]</translate>'
+        )
+
+    def test_text_around_br_tag(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("First line.<br/>Second line."),
+            "<translate>First line.</translate><br/><translate>Second line.</translate>"
+        )
+
+    def test_empty_string_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(""),
+            ""
+        )
+
+    def test_whitespace_only_input(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(" \n\t "),
+            " \n\t "
+        )
+
+    def test_list_items(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext("* Item 1\n** Sub-item 1.1\n* Item 2"),
+            "* <translate>Item 1</translate>\n** <translate>Sub-item 1.1</translate>\n* <translate>Item 2</translate>\n"
+        )
+
+    def test_definition_list(self):
+        self.assertEqual(
+            convert_to_translatable_wikitext(";Term\n:Definition\n:Description"),
+            "; <translate>Term</translate>\n: <translate>Definition</translate>\n: <translate>Description</translate>\n"
+        )
+
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main(exit=False, failfast=True)