Skip to content

Commit 18d1d0b

Browse files
authored
Merge pull request #3 from drunkpig/dev
feat: init project
2 parents 0d1eb54 + bb1ad4f commit 18d1d0b

21 files changed

Lines changed: 325 additions & 5 deletions

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,18 @@ cython_debug/
192192
# refer to https://docs.cursor.com/context/ignore-files
193193
.cursorignore
194194
.cursorindexingignore
195+
196+
# sphinx docs
197+
_build/
198+
199+
200+
output/
201+
**/temp.py
202+
203+
# coverage file
204+
.coverage*
205+
coverage.xml
206+
207+
208+
.env
209+
*.egg-info/

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ repos:
4040
- mdformat_frontmatter
4141
- linkify-it-py
4242
exclude: '^tests/.*/assets/'
43-
- repo: https://github.com/myint/docformatter
44-
rev: v1.3.1
45-
hooks:
46-
- id: docformatter
47-
args: [ "--in-place", "--wrap-descriptions", "119" ]
43+
# - repo: https://github.com/myint/docformatter
44+
# rev: v1.3.1
45+
# hooks:
46+
# - id: docformatter
47+
# args: [ "--in-place", "--wrap-descriptions", "119" ]
4848
- repo: local
4949
hooks:
5050
- id: clear-jupyter-notebook-output

docs/.gitkeep

Whitespace-only changes.

realcrawl/__init__.py

Whitespace-only changes.

realcrawl/cfg.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""Unified configuration loading function.
2+
The configuration file is looked up at:
3+
1. the path given by the environment variable REAL_CRAWL_CONFIG_PATH
4+
2. ~/.realcrawl/.realcrawl.jsonc
5+
"""
6+
7+
import os
8+
9+
import commentjson as json
10+
from loguru import logger
11+
12+
from realcrawl.exception.base import ConfigFileNotFoundException
13+
14+
15+
def load_config(suppress_error: bool = False) -> dict:
    """Load the realcrawl configuration file.

    The path is taken from the environment variable REAL_CRAWL_CONFIG_PATH
    when it is set to a non-empty value; otherwise the default path
    ~/.realcrawl/.realcrawl.jsonc is used.

    Args:
        suppress_error: When True, return an empty dict instead of raising
            if the configuration file cannot be found.

    Raises:
        ConfigFileNotFoundException: The resolved path is not an existing
            file and suppress_error is False.

    Returns:
        config(dict): The parsed configuration, or {} when the file is
            missing and suppress_error is True.
    """
    # Resolve the path (environment variable first, then the default
    # location) together with the matching "not found" message.
    env_cfg_path = os.getenv('REAL_CRAWL_CONFIG_PATH')
    if env_cfg_path:
        cfg_path = env_cfg_path
        not_found_msg = f'environment variable REAL_CRAWL_CONFIG_PATH points to a non-exist file: {cfg_path}'
    else:
        cfg_path = os.path.expanduser('~/.realcrawl/.realcrawl.jsonc')
        not_found_msg = f'{cfg_path} does not exist, please create one or set environment variable REAL_CRAWL_CONFIG_PATH to a valid file path'

    # isfile (not exists): a directory at cfg_path would pass an existence
    # check and then fail inside open() with an unrelated OSError.
    if not os.path.isfile(cfg_path):
        if suppress_error:
            return {}

        logger.warning(not_found_msg)
        raise ConfigFileNotFoundException(not_found_msg)

    # Parse the configuration file.
    with open(cfg_path, 'r', encoding='utf-8') as f:
        return json.load(f)

realcrawl/crawl/cli/__init__.py

Whitespace-only changes.

realcrawl/exception/__init__.py

Whitespace-only changes.

realcrawl/exception/base.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import inspect
2+
from pathlib import Path
3+
4+
import commentjson as json
5+
6+
7+
class ErrorMsg:
    """Registry mapping error codes to their message, module and name."""

    # Keyed by the error code rendered as a string.
    _errors = {}

    @classmethod
    def _load_errors(cls):
        """Fill the registry from the exception.jsonc file next to this module."""
        defs_path = Path(__file__).parent / 'exception.jsonc'
        with open(defs_path, 'r', encoding='utf-8') as fh:
            definitions = json.load(fh)

        for module_name in definitions:
            for error_name, spec in definitions[module_name].items():
                key = str(spec['code'])
                cls._errors[key] = {
                    'message': spec['message'],
                    'module': module_name,
                    'error_name': error_name,
                }

    @classmethod
    def get_error_message(cls, error_code: int):
        """Return the message registered for error_code, or a fallback text."""
        key = str(error_code)
        if key in cls._errors:
            return cls._errors[key]['message']
        return f'unknown error code {error_code}'

    @classmethod
    def get_error_code(cls, module: str, error_name: str) -> int:
        """Return the error code registered for the given module and error name.

        Raises:
            ValueError: no entry matches module and error_name.
        """
        matched = next(
            (
                code
                for code, info in cls._errors.items()
                if info['module'] == module and info['error_name'] == error_name
            ),
            None,
        )
        if matched is None:
            raise ValueError(f'error code not found: module={module}, error_name={error_name}')
        return int(matched)
40+
41+
42+
# Populate the error registry at import time; the exception constructors
# below look up their codes and messages through ErrorMsg.
ErrorMsg._load_errors()
43+
44+
45+
class RealCrawlBaseException(Exception):
    """Base exception class for realcrawl.

    Each instance carries an error code, the message registered for that
    code in ErrorMsg, an optional caller-supplied custom message, and the
    source file and line number where the exception object was created.
    """

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        """Initialise the exception.

        Args:
            custom_message: Free-form detail appended to the string form.
            error_code: Code to report; defaults to the code registered for
                'realcrawlBase' / 'realcrawlBaseException'.
        """
        if error_code is None:
            error_code = ErrorMsg.get_error_code('realcrawlBase', 'realcrawlBaseException')

        self.error_code = error_code
        self.message = ErrorMsg.get_error_message(self.error_code)
        self.custom_message = custom_message
        self.dataset_name = ''
        super().__init__(self.message)

        # Record where the exception was created. Subclasses reach this
        # method through super().__init__, so the immediate caller may be a
        # subclass __init__ of this same instance; skip those frames so the
        # reported location is the code that constructed the exception.
        frame = inspect.currentframe()  # may be None on some interpreters
        caller = None
        try:
            caller = frame.f_back if frame is not None else None
            while (
                caller is not None
                and caller.f_code.co_name == '__init__'
                and caller.f_locals.get('self') is self
            ):
                caller = caller.f_back

            if caller is not None:
                self.__py_filename = caller.f_code.co_filename
                self.__py_file_line_number = caller.f_lineno
            else:
                self.__py_filename = '<unknown>'
                self.__py_file_line_number = 0
        finally:
            # Drop frame references so they do not keep frames alive.
            del frame
            caller = None

    def __str__(self):
        """Return '<file>: <line>#<error_code>#<message>#<custom_message>'."""
        return (
            f'{self.__py_filename}: {self.__py_file_line_number}#{self.error_code}#{self.message}#{self.custom_message}'
        )
65+
66+
67+
##############################################################################
68+
#
69+
# Config Exceptions
70+
#
71+
##############################################################################
72+
73+
class ConfigBaseException(RealCrawlBaseException):
    """Root of all configuration-related exceptions."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to this class's registered code when none is supplied.
        resolved_code = (
            ErrorMsg.get_error_code('Config', 'ConfigBaseException')
            if error_code is None
            else error_code
        )
        super().__init__(custom_message, resolved_code)
79+
80+
81+
class ConfigFileNotFoundException(ConfigBaseException):
    """Raised when the configuration file cannot be found."""

    def __init__(self, custom_message: str | None = None, error_code: int | None = None):
        # Fall back to this class's registered code when none is supplied.
        resolved_code = (
            ErrorMsg.get_error_code('Config', 'ConfigFileNotFoundException')
            if error_code is None
            else error_code
        )
        super().__init__(custom_message, resolved_code)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
// Base exceptions (10000000)
3+
"realcrawlBase": {
4+
"realcrawlBaseException": {
5+
"code": 10000000,
6+
"message": "realcrawl base exception"
7+
}
8+
},
9+
10+
// Configuration-related exceptions (20000000)
11+
"Config": {
12+
"ConfigBaseException": {
13+
"code": 20000000,
14+
"message": "Config base exception"
15+
},
16+
"ConfigFileNotFoundException": {
17+
"code": 21000000,
18+
"message": "Config file not found exception"
19+
}
20+
}
21+
}

realcrawl/extract/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)