|
| 1 | +import sys |
| 2 | +import os |
| 3 | +import re |
| 4 | +from PyQt6.QtWidgets import QApplication, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QFileDialog, QProgressBar, QLabel, QMessageBox, QRadioButton, QButtonGroup, QInputDialog |
| 5 | +from PyQt6.QtCore import QThread, pyqtSignal |
| 6 | +from bs4 import BeautifulSoup, NavigableString |
| 7 | +import html2text |
| 8 | + |
class ConversionThread(QThread):
    """Worker thread that converts HTML files to Markdown off the GUI thread.

    Each input file is parsed with BeautifulSoup, cleaned by
    ``preprocess_html``, rendered by html2text and tidied by
    ``postprocess_markdown``.  In merge mode the converted texts are collected
    in ``merged_content`` (writing them out is left to the caller); otherwise
    one ``<input stem>.md`` file per input is written into ``output_dir``.
    """

    # Emitted after every processed file: (files done so far, total files).
    progress = pyqtSignal(int, int)
    # Emitted once after the loop, unless the run was cancelled or failed.
    # NOTE(review): this reuses the name of QThread's own ``finished`` signal;
    # it is kept unchanged because callers connect to it by this name.
    finished = pyqtSignal()
    # Carries ``str(exception)`` of whatever exception aborted the run.
    error = pyqtSignal(str)

    # Patterns are compiled once instead of on every converted file.
    _IMAGE_RE = re.compile(r'!\[([^\]]*)\]\([^\)]+\)')
    _LINK_RE = re.compile(r'\[([^\]]+)\]\([^\)]+\)')
    _HEADING_RE = re.compile(r'^#{1,6}\s')
    _BLANK_RUN_RE = re.compile(r'\n{3,}')
    _UNITY_CLASS_RE = re.compile('unity-')

    def __init__(self, files, output_dir, merge):
        """Store the job description and configure the html2text converter.

        Args:
            files: Paths of the HTML files to convert, processed in order.
            output_dir: Directory that receives the ``.md`` files when
                ``merge`` is false.
            merge: When true, results are only collected in
                ``merged_content`` and nothing is written by this thread.
        """
        super().__init__()
        self.files = files
        self.output_dir = output_dir
        self.merge = merge
        self.h = html2text.HTML2Text()
        self.h.body_width = 0  # html2text: 0 disables hard line wrapping
        self.h.unicode_snob = True
        self.h.ignore_links = True
        self.h.ignore_images = True
        self.h.ignore_anchors = True
        self.h.ignore_emphasis = False
        self.h.ignore_tables = False
        self.is_cancelled = False
        self.merged_content = []

    def run(self):
        """Convert every file, reporting progress, completion or failure.

        The cancellation flag is checked before each file, so a cancel request
        takes effect once the file currently being converted is done.  Any
        exception stops the run and is reported through ``error``.
        """
        try:
            total_files = len(self.files)
            for done, file in enumerate(self.files, start=1):
                if self.is_cancelled:
                    break
                content = self.convert_to_markdown(file)
                if self.merge:
                    self.merged_content.append(content)
                else:
                    stem = os.path.splitext(os.path.basename(file))[0]
                    output_path = os.path.join(self.output_dir, f"{stem}.md")
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(content)
                self.progress.emit(done, total_files)

            if not self.is_cancelled:
                self.finished.emit()
        except Exception as e:
            # Top-level boundary of the worker: forward the failure to the GUI
            # instead of letting it vanish inside the thread.
            self.error.emit(str(e))

    def convert_to_markdown(self, file):
        """Return the Markdown rendering of one HTML file.

        Only the ``<main>`` element is converted, falling back to ``<body>``.
        When the document has neither, an empty string is returned.

        Args:
            file: Path of a UTF-8 encoded HTML file.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')
        main_content = soup.find('main') or soup.find('body')
        if not main_content:
            return ""

        self.preprocess_html(main_content)
        markdown_content = self.h.handle(str(main_content))
        return self.postprocess_markdown(markdown_content)

    def preprocess_html(self, soup):
        """Strip and rewrite elements of ``soup`` in place before conversion.

        Removes ``script``/``style``/``nav``/``footer`` elements and images,
        renames ``unity-code-block`` elements to ``pre`` and ``unity-note``
        elements to ``blockquote``, and replaces every link by its text.
        """
        for element in soup(['script', 'style', 'nav', 'footer']):
            element.decompose()

        for element in soup.find_all(class_=self._UNITY_CLASS_RE):
            classes = element.get('class', [])
            if 'unity-code-block' in classes:
                element.name = 'pre'
            elif 'unity-note' in classes:
                element.name = 'blockquote'

        for a in soup.find_all('a'):
            a.replace_with(a.text)

        for img in soup.find_all('img'):
            img.decompose()

    def postprocess_markdown(self, content):
        """Clean up converted Markdown text and return it.

        Removes leftover image markup, reduces leftover links to their label,
        puts a blank line before and after every ATX heading, collapses runs
        of blank lines to a single one and strips the surrounding whitespace.
        """
        # Images must go first: the link pattern also matches the
        # ``[alt](url)`` tail of ``![alt](url)`` and would leave ``!alt``.
        content = self._IMAGE_RE.sub('', content)
        content = self._LINK_RE.sub(r'\1', content)

        # Pad headings *before* collapsing blank lines, so the padding cannot
        # reintroduce runs of blank lines or a leading newline.
        padded_lines = [
            f"\n{line}\n" if self._HEADING_RE.match(line) else line
            for line in content.split('\n')
        ]
        content = '\n'.join(padded_lines)

        content = self._BLANK_RUN_RE.sub('\n\n', content)
        return content.strip()

    def cancel(self):
        """Ask ``run`` to stop before it starts the next file."""
        self.is_cancelled = True
| 96 | + |
class Unity2Markdown(QWidget):
    """Main window of the HTML-to-Markdown converter.

    Lets the user pick input HTML files and an output directory, choose
    between one merged file and separate files, and runs the conversion in a
    ``ConversionThread`` while showing its progress.

    ``self.files`` and ``self.output_dir`` only exist after the user has made
    the corresponding selection; ``reset_application`` deletes them again.
    """

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Build the widgets, wire their signals and set title and geometry."""
        layout = QVBoxLayout()

        self.select_files_btn = QPushButton('Select HTML Files')
        self.select_files_btn.clicked.connect(self.select_files)
        layout.addWidget(self.select_files_btn)

        self.select_output_btn = QPushButton('Select Output Directory')
        self.select_output_btn.clicked.connect(self.select_output_dir)
        layout.addWidget(self.select_output_btn)

        merge_layout = QHBoxLayout()
        self.merge_group = QButtonGroup()
        self.merge_radio = QRadioButton('Merge into single file')
        self.separate_radio = QRadioButton('Create separate files')
        self.merge_group.addButton(self.merge_radio)
        self.merge_group.addButton(self.separate_radio)
        self.separate_radio.setChecked(True)
        merge_layout.addWidget(self.merge_radio)
        merge_layout.addWidget(self.separate_radio)
        layout.addLayout(merge_layout)

        button_layout = QHBoxLayout()
        self.convert_btn = QPushButton('Convert')
        self.convert_btn.clicked.connect(self.start_conversion)
        button_layout.addWidget(self.convert_btn)

        self.cancel_btn = QPushButton('Cancel')
        self.cancel_btn.clicked.connect(self.cancel_conversion)
        self.cancel_btn.setEnabled(False)
        button_layout.addWidget(self.cancel_btn)

        layout.addLayout(button_layout)

        progress_layout = QHBoxLayout()
        self.progress_bar = QProgressBar()
        self.file_counter_label = QLabel('0 / 0')
        progress_layout.addWidget(self.progress_bar)
        progress_layout.addWidget(self.file_counter_label)
        layout.addLayout(progress_layout)

        self.status_label = QLabel('Ready to convert')
        layout.addWidget(self.status_label)

        self.setLayout(layout)
        self.setWindowTitle('Unity2Markdown Converter v1.1')
        self.setGeometry(300, 300, 500, 250)

    def select_files(self):
        """Ask for the input HTML files and remember a non-empty selection."""
        files, _ = QFileDialog.getOpenFileNames(self, 'Select HTML Files', '', 'HTML Files (*.html)')
        if files:
            self.files = files
            self.status_label.setText(f'{len(files)} files selected')

    def select_output_dir(self):
        """Ask for the output directory and remember it unless cancelled."""
        directory = QFileDialog.getExistingDirectory(self, 'Select Output Directory')
        if directory:
            self.output_dir = directory
            self.status_label.setText(f'Output directory: {directory}')

    def start_conversion(self):
        """Start a worker thread for the current selection.

        Does nothing but update the status label when the input files or the
        output directory have not been selected yet.
        """
        if not hasattr(self, 'files') or not hasattr(self, 'output_dir'):
            self.status_label.setText('Please select input files and output directory')
            return

        # A cancelled worker still finishes the file it is working on.  Wait
        # for it before its reference is replaced, so the QThread object is
        # not dropped while the thread is still running.
        self._wait_for_worker()

        merge = self.merge_radio.isChecked()

        self.conversion_thread = ConversionThread(self.files, self.output_dir, merge)
        self.conversion_thread.progress.connect(self.update_progress)
        self.conversion_thread.finished.connect(self.conversion_finished)
        self.conversion_thread.error.connect(self.conversion_error)
        self.conversion_thread.start()

        self.convert_btn.setEnabled(False)
        self.cancel_btn.setEnabled(True)
        self.status_label.setText('Converting...')

    def update_progress(self, current, total):
        """Show ``current`` of ``total`` files in the bar and the counter."""
        worker = getattr(self, 'conversion_thread', None)
        if worker is not None and worker.is_cancelled:
            # Late report from a cancelled run; the display was already reset.
            return
        if total <= 0:
            return
        progress_percentage = int((current / total) * 100)
        self.progress_bar.setValue(progress_percentage)
        self.file_counter_label.setText(f'{current} / {total}')

    def conversion_finished(self):
        """Finish a successful run: save or announce the result, then reset."""
        # Use the mode the worker actually ran with; the radio button may
        # have been toggled while the conversion was in progress.
        if self.conversion_thread.merge:
            self.get_merge_filename()
        else:
            self.show_completion_popup()
        self.reset_application()

    def get_merge_filename(self):
        """Ask for a file name and write the merged Markdown into it.

        The converted documents are joined with a ``---`` separator and saved
        into the output directory the worker was started with.  ``.md`` is
        appended to the name when missing.  A warning is shown when the
        dialog is cancelled or the name is empty, an error box when the file
        cannot be written.
        """
        filename, ok = QInputDialog.getText(self, 'Merge Filename', 'Enter the name for the merged file:')
        filename = filename.strip()
        if not (ok and filename):
            QMessageBox.warning(self, 'Warning', 'Merged file was not saved. Conversion cancelled.')
            return

        if not filename.endswith('.md'):
            filename += '.md'

        # Read the directory from the worker: ``self.output_dir`` can have
        # been changed or deleted since the conversion was started.
        worker = self.conversion_thread
        output_path = os.path.join(worker.output_dir, filename)
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n---\n\n".join(worker.merged_content))
        except OSError as exc:
            # Report the failure instead of letting it escape from the slot.
            QMessageBox.critical(self, 'Error', f"Could not save the merged file:\n\n{exc}")
            return
        self.show_completion_popup()

    def conversion_error(self, error_message):
        """Reset the window and show the error reported by the worker."""
        self.reset_application()
        QMessageBox.critical(self, 'Error', f"An error occurred during conversion:\n\n{error_message}")

    def cancel_conversion(self):
        """Request cancellation of the current worker and reset the window."""
        if hasattr(self, 'conversion_thread'):
            self.conversion_thread.cancel()
        self.reset_application()

    def show_completion_popup(self):
        """Tell the user that the conversion completed."""
        QMessageBox.information(self, 'Conversion Complete', 'The HTML to Markdown conversion has been completed successfully!')

    def reset_application(self):
        """Return the widgets to their idle state and forget the selection."""
        self.convert_btn.setEnabled(True)
        self.cancel_btn.setEnabled(False)
        self.progress_bar.setValue(0)
        self.file_counter_label.setText('0 / 0')
        self.status_label.setText('Ready to convert')
        if hasattr(self, 'files'):
            del self.files
        if hasattr(self, 'output_dir'):
            del self.output_dir

    def closeEvent(self, event):
        """Stop a running worker before the window goes away."""
        worker = getattr(self, 'conversion_thread', None)
        if worker is not None and worker.isRunning():
            worker.cancel()
        self._wait_for_worker()
        super().closeEvent(event)

    def _wait_for_worker(self):
        """Block until a previously started worker thread has stopped."""
        worker = getattr(self, 'conversion_thread', None)
        if worker is not None and worker.isRunning():
            worker.wait()
| 225 | + |
if __name__ == '__main__':
    # Script entry point: create the Qt application, show the converter
    # window and hand the event loop's return code back to the shell.
    qt_app = QApplication(sys.argv)
    window = Unity2Markdown()
    window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)
| 231 | + |
| 232 | + |
0 commit comments