html2markdown/html2markdown.py at main · ChiamZhang/html2markdown · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import random
import string

import requests as requests

import html2text
from bs4 import BeautifulSoup

def _markdown_row(cells):
    """Format a sequence of table-cell tags as one Markdown table row."""
    return "| " + " | ".join(cell.text.strip() for cell in cells) + " |"


def html_table_to_markdown(html_table):
    """Replace every ``<table>`` in an HTML document with Markdown table text.

    Each table is swapped for a ``<p>`` element whose content is the
    Markdown rows separated by ``<br>`` tags.

    Tables without explicit ``<thead>``/``<tbody>`` wrappers are handled
    as well: when there is no ``<thead>``, a first row made up only of
    ``<th>`` cells is used as the header, and every ``<tr>`` outside
    ``<thead>`` is treated as a body row.

    Args:
        html_table: HTML markup (string) that may contain any number of tables.

    Returns:
        The parsed ``BeautifulSoup`` document with the tables replaced.
    """
    soup = BeautifulSoup(html_table, 'html.parser')

    for table in soup.find_all('table'):
        thead = table.find('thead')
        # html.parser does not insert a missing <tbody>, so collect rows from
        # the whole table instead of relying on that wrapper being present.
        body_rows = [row for row in table.find_all('tr')
                     if row.find_parent('thead') is None]

        if thead:
            header_cells = thead.find_all(['th', 'td'])
        elif body_rows and body_rows[0].find('th') and not body_rows[0].find('td'):
            # No <thead>: promote a leading all-<th> row to the header.
            header_cells = body_rows.pop(0).find_all('th')
        else:
            header_cells = []

        lines = []
        if header_cells:
            lines.append(_markdown_row(header_cells))
            lines.append("| " + " | ".join("---" for _ in header_cells) + " |")
        # Include <th> cells so row headers keep the columns aligned.
        lines.extend(_markdown_row(row.find_all(['td', 'th'])) for row in body_rows)

        # Build the replacement from text nodes and <br> tags rather than
        # re-parsing a string, so cell text containing "<" stays literal text.
        new_table = soup.new_tag('p')
        for line in lines:
            new_table.append(line)
            new_table.append(soup.new_tag('br'))

        table.replace_with(new_table)

    return soup

def sanitize_filename(filename):
    """Return *filename* with whitespace and reserved path characters removed.

    The characters deleted are everything in ``string.whitespace`` plus
    ``\\ / : * ? " < > |``.
    """
    forbidden = string.whitespace + r'\/:*?"<>|'
    # The three-argument maketrans maps every character in `forbidden` to None.
    deletion_table = str.maketrans('', '', forbidden)
    return filename.translate(deletion_table)


def generate_random_filename(filename):
    """Return a variant of *filename* carrying a random 8-character tag.

    A trailing ``.md`` extension (any letter case) is removed before the
    tag is appended; names without that extension are kept whole instead
    of losing their last three characters. The result always ends in
    ``.md``.

    Args:
        filename: The file name to derive the new name from.

    Returns:
        ``<stem><8 random letters/digits>.md``.
    """
    extension = ".md"
    if filename.lower().endswith(extension):
        stem = filename[:-len(extension)]
    else:
        stem = filename
    # Letters and digits only, so the tag needs no filename sanitizing.
    random_str = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
    return f"{stem}{random_str}{extension}"

def ensure_directory_exists(directory_path):
    """Create *directory_path* (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so a
    directory created by someone else between the check and the creation
    no longer causes a failure.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    os.makedirs(directory_path, exist_ok=True)


def write_to_file(content, filename, target_directory='md'):
    """Write *content* to a UTF-8 text file inside *target_directory*.

    The directory is created when missing. If a file with the requested
    name already exists, a randomized variant of the name is used instead
    so the existing file is not overwritten.

    Args:
        content: Text to write.
        filename: Preferred file name (without directory).
        target_directory: Directory to write into; defaults to ``'md'``.
    """
    ensure_directory_exists(target_directory)

    candidate = filename
    full_path = os.path.join(target_directory, candidate)
    while os.path.exists(full_path):
        # Derive every retry from the original name so repeated collisions
        # do not keep stacking random tags onto the previous attempt.
        candidate = generate_random_filename(filename)
        full_path = os.path.join(target_directory, candidate)

    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"File {candidate} Download！")


def getHttpResponse(url, titleID=None, contentID=None, cookie_data=None, timeout=30):
    """Download a page and extract its title and content as HTML strings.

    Tables in the page are converted to Markdown text by
    ``html_table_to_markdown`` before the elements are looked up.

    Args:
        url: Address of the page to fetch.
        titleID: ``id`` attribute of the element holding the title. When it
            is empty/None, or no such element exists, a random 8-character
            string is used as the title.
        contentID: ``id`` attribute of the element holding the content. When
            it is empty/None, or no such element exists, the whole document
            is returned as the content.
        cookie_data: Optional iterable of mappings with ``"name"`` and
            ``"value"`` keys, sent as request cookies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A two-item list ``[title, html_content]`` of strings.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.5",
        # Accept-Encoding is deliberately not overridden: advertising "br"
        # makes servers send Brotli bodies, which requests can only decode
        # when a Brotli library is installed.
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",  # Do Not Track
    }
    cookies = None
    if cookie_data is not None:
        cookies = {cookie["name"]: cookie["value"] for cookie in cookie_data}

    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)

    soup = html_table_to_markdown(response.text)

    title_node = None
    if titleID is not None and titleID != "":
        title_node = soup.find(id=titleID)
    if title_node is not None:
        tittle = str(title_node)
    else:
        # No usable title element: fall back to a random name instead of
        # the literal string "None".
        tittle = ''.join(random.choices(string.ascii_letters + string.digits, k=8))

    content_node = None
    if contentID is not None and contentID != "":
        content_node = soup.find(id=contentID)
    # A missing content element falls back to the whole document.
    html_content = str(content_node) if content_node is not None else str(soup)

    return [tittle, html_content]


def html2markdown(url, titleID=None, contentID=None, cookie_data=None):
    """Fetch *url*, convert it to Markdown and save it as a ``.md`` file.

    The file name is derived from the converted title and the file body is
    the converted title followed by the converted content.

    Args:
        url: Address of the page to convert.
        titleID: ``id`` of the title element (optional).
        contentID: ``id`` of the content element (optional).
        cookie_data: Optional cookies, forwarded to ``getHttpResponse``.
    """
    raw_title, raw_content = getHttpResponse(url, titleID, contentID, cookie_data)

    tittle = html2text.html2text(raw_title)
    html_content = html2text.html2text(raw_content)

    # Drop leading Markdown heading markers ("#", "##", ...) only when they
    # are present, rather than blindly discarding the first two characters.
    heading_text = tittle.strip().lstrip('#')
    # An empty title would otherwise produce the hidden file name ".md".
    fileName = (sanitize_filename(heading_text) or "untitled") + ".md"

    write_to_file(tittle + html_content, fileName)