subtool/test_japanese_detection.py at main · Halffd/subtool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
# Test script for Japanese content detection in subtitle files

import os
import sys
import re
from pathlib import Path
import logging

# Configure logging
# Root logger: INFO level, timestamped format, records written to stdout through
# an explicit StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
# Module-wide logger used by every function in this script.
logger = logging.getLogger("JapaneseDetectionTest")

# Add src directory to path
# "src" is resolved relative to this script's own location and inserted at the
# front of sys.path so the `utils` package imported below is looked up there first.
src_path = Path(__file__).resolve().parent / "src"
sys.path.insert(0, str(src_path))

# This import must stay after the sys.path insertion above. If it fails, the
# script logs an error and terminates with exit status 1.
try:
    from utils.pattern_guesser import PatternGuesser
except ImportError:
    logger.error("Failed to import PatternGuesser module. Make sure you're running from the project root.")
    sys.exit(1)

# Punctuation that, together with whitespace, is excluded from character counts.
_IGNORED_PUNCTUATION = '.,:;?!()[]{}"\''


def _is_japanese_char(char):
    """Return True if *char* is Hiragana, Katakana or a CJK Unified Ideograph."""
    return (
        '\u3040' <= char <= '\u309F'       # Hiragana
        or '\u30A0' <= char <= '\u30FF'    # Katakana
        or '\u4E00' <= char <= '\u9FFF'    # CJK Unified Ideographs
    )


def _extract_text(content):
    """Return the subtitle text of *content*, or '' if no text lines remain.

    SRT timestamp lines, bare cue numbers and HTML tags are removed, then blank
    and digit-only lines are dropped.
    """
    cleaned = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+', '', content)
    cleaned = re.sub(r'^\d+$', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'<[^>]+>', '', cleaned)
    text_lines = [line for line in cleaned.split('\n')
                  if line.strip() and not line.strip().isdigit()]
    return '\n'.join(text_lines)


def _log_character_breakdown(content):
    """Log total/Japanese character counts, the percentage and a short sample."""
    text_content = _extract_text(content)
    if not text_content:
        logger.info("No valid text lines found in file")
        return

    # Whitespace and common punctuation do not count towards the total.
    countable = [char for char in text_content
                 if not (char.isspace() or char in _IGNORED_PUNCTUATION)]
    # None of the Japanese ranges overlap the ignored characters, so filtering
    # the countable list yields the same characters as filtering the full text.
    japanese = [char for char in countable if _is_japanese_char(char)]

    total_chars = len(countable)
    japanese_chars = len(japanese)
    # Guard against text consisting solely of ignored characters.
    japanese_percentage = (japanese_chars / total_chars) * 100 if total_chars else 0

    logger.info(f"Total characters: {total_chars}")
    logger.info(f"Japanese characters: {japanese_chars}")
    logger.info(f"Japanese percentage: {japanese_percentage:.2f}%")
    logger.info("Decision threshold: >30%")

    if japanese:
        sample = ''.join(japanese[:30])
        logger.info(f"Sample Japanese characters: {sample}...")


def test_file(file_path, verbose=None):
    """Test a specific file for Japanese content detection.

    Reads the file, asks ``PatternGuesser._is_japanese_content`` for its verdict
    and logs it. In verbose mode a character-level breakdown is logged as well.
    NOTE(review): the breakdown is meant to mirror the cleaning/counting done
    inside PatternGuesser -- confirm against that module if the two disagree.

    Args:
        file_path: Path of the subtitle file to analyse.
        verbose: Whether to log the character breakdown. When None (the
            default), verbose mode is enabled if the second command-line
            argument is ``verbose`` (case-insensitive).

    Returns:
        None. A missing file or any error during analysis is logged, not raised.
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return

    if verbose is None:
        verbose = len(sys.argv) > 2 and sys.argv[2].lower() == 'verbose'

    try:
        guesser = PatternGuesser(logger)

        # 'utf-8-sig' decodes plain UTF-8 identically but strips a leading BOM.
        # Left in place, the BOM keeps the first cue number from matching the
        # digit-only filters and inflates the non-Japanese character count.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        is_japanese = guesser._is_japanese_content(content)

        logger.info(f"File: {file_path}")
        logger.info(f"Contains significant Japanese content (>30%): {is_japanese}")

        if verbose:
            _log_character_breakdown(content)

    except Exception as e:
        # Top-level boundary of a diagnostic script: report the failure with its
        # traceback in a single log record instead of crashing.
        logger.exception(f"Error analyzing file: {e}")

def main():
    """Run a test of the Japanese content detection on a specified file.

    The file path is taken from the first command-line argument.

    Returns:
        int: Exit status for the process -- 2 when no file path argument was
        given (usage error), otherwise 0. Problems found while analysing the
        file are logged by ``test_file`` and are not reflected in the status.
    """
    if len(sys.argv) < 2:
        logger.error("Please provide a subtitle file path as an argument")
        logger.info("Usage: python test_japanese_detection.py path/to/subtitle.srt [verbose]")
        logger.info("Add 'verbose' for detailed character analysis")
        # Non-zero status so shell callers can detect the misuse.
        return 2

    test_file(sys.argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())