Skip to content

Commit c041d8d

Browse files
authored
Merge pull request #4 from CocoRoF/main
feat: Add OCR engine module with support for multiple providers
2 parents ba7b6dd + b42e494 commit c041d8d

123 files changed

Lines changed: 337 additions & 336 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.
Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# libs/__init__.py
1+
# contextifier/__init__.py
22
"""
33
Contextifier Library
44
@@ -23,14 +23,14 @@
2323
result = processor.extract_chunks("document.pdf", chunk_size=1000)
2424
"""
2525

26-
__version__ = "0.1.0"
26+
__version__ = "0.1.2"
2727

2828
# Expose core classes at top level
29-
from libs.core import DocumentProcessor
29+
from contextifier.core import DocumentProcessor
3030

3131
# Explicit subpackages
32-
from libs import core
33-
from libs import chunking
32+
from contextifier import core
33+
from contextifier import chunking
3434

3535
__all__ = [
3636
"__version__",
Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
"""
2121

2222
# === Main Chunking Functions (chunking.py) ===
23-
from libs.chunking.chunking import (
23+
from contextifier.chunking.chunking import (
2424
# Primary API
2525
create_chunks,
2626
# Backward compatibility (deprecated)
@@ -30,7 +30,7 @@
3030
)
3131

3232
# constants
33-
from libs.chunking.constants import (
33+
from contextifier.chunking.constants import (
3434
# Constants
3535
LANGCHAIN_CODE_LANGUAGE_MAP,
3636
HTML_TABLE_PATTERN,
@@ -48,15 +48,15 @@
4848
)
4949

5050
# table_parser
51-
from libs.chunking.table_parser import (
51+
from contextifier.chunking.table_parser import (
5252
parse_html_table,
5353
extract_cell_spans,
5454
extract_cell_spans_with_positions,
5555
has_complex_spans,
5656
)
5757

5858
# table_chunker
59-
from libs.chunking.table_chunker import (
59+
from contextifier.chunking.table_chunker import (
6060
calculate_available_space,
6161
adjust_rowspan_in_chunk,
6262
build_table_chunk,
@@ -67,7 +67,7 @@
6767
)
6868

6969
# protected_regions
70-
from libs.chunking.protected_regions import (
70+
from contextifier.chunking.protected_regions import (
7171
find_protected_regions,
7272
get_protected_region_positions,
7373
ensure_protected_region_integrity,
@@ -79,15 +79,15 @@
7979
)
8080

8181
# page_chunker
82-
from libs.chunking.page_chunker import (
82+
from contextifier.chunking.page_chunker import (
8383
split_into_pages,
8484
merge_pages,
8585
get_overlap_content,
8686
chunk_by_pages,
8787
)
8888

8989
# text_chunker
90-
from libs.chunking.text_chunker import (
90+
from contextifier.chunking.text_chunker import (
9191
chunk_plain_text,
9292
chunk_text_without_tables,
9393
chunk_with_row_protection,
@@ -100,7 +100,7 @@
100100
)
101101

102102
# sheet_processor
103-
from libs.chunking.sheet_processor import (
103+
from contextifier.chunking.sheet_processor import (
104104
extract_document_metadata,
105105
prepend_metadata_to_chunks,
106106
extract_sheet_sections,
Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
from typing import Any, Dict, List, Optional, Union
2626

2727
# Import from individual modules
28-
from libs.chunking.constants import (
28+
from contextifier.chunking.constants import (
2929
LANGCHAIN_CODE_LANGUAGE_MAP,
3030
HTML_TABLE_PATTERN,
3131
CHART_BLOCK_PATTERN,
@@ -40,14 +40,14 @@
4040
ParsedTable,
4141
)
4242

43-
from libs.chunking.table_parser import (
43+
from contextifier.chunking.table_parser import (
4444
parse_html_table as _parse_html_table,
4545
extract_cell_spans as _extract_cell_spans,
4646
extract_cell_spans_with_positions as _extract_cell_spans_with_positions,
4747
has_complex_spans as _has_complex_spans,
4848
)
4949

50-
from libs.chunking.table_chunker import (
50+
from contextifier.chunking.table_chunker import (
5151
calculate_available_space as _calculate_available_space,
5252
adjust_rowspan_in_chunk as _adjust_rowspan_in_chunk,
5353
build_table_chunk as _build_table_chunk,
@@ -57,7 +57,7 @@
5757
chunk_large_table as _chunk_large_table,
5858
)
5959

60-
from libs.chunking.protected_regions import (
60+
from contextifier.chunking.protected_regions import (
6161
find_protected_regions as _find_protected_regions,
6262
get_protected_region_positions as _get_protected_region_positions,
6363
ensure_protected_region_integrity as _ensure_protected_region_integrity,
@@ -67,14 +67,14 @@
6767
split_large_chunk_with_table_protection as _split_large_chunk_with_table_protection,
6868
)
6969

70-
from libs.chunking.page_chunker import (
70+
from contextifier.chunking.page_chunker import (
7171
split_into_pages as _split_into_pages,
7272
merge_pages as _merge_pages,
7373
get_overlap_content as _get_overlap_content,
7474
chunk_by_pages as _chunk_by_pages,
7575
)
7676

77-
from libs.chunking.text_chunker import (
77+
from contextifier.chunking.text_chunker import (
7878
chunk_plain_text as _chunk_plain_text,
7979
chunk_text_without_tables,
8080
chunk_with_row_protection,
@@ -86,7 +86,7 @@
8686
estimate_chunks_count,
8787
)
8888

89-
from libs.chunking.sheet_processor import (
89+
from contextifier.chunking.sheet_processor import (
9090
extract_document_metadata as _extract_document_metadata,
9191
prepend_metadata_to_chunks as _prepend_metadata_to_chunks,
9292
extract_sheet_sections as _extract_sheet_sections,
@@ -120,7 +120,7 @@ def _get_page_marker_patterns(page_tag_processor: Optional[Any] = None) -> List[
120120
]
121121
# Add slide pattern if different prefix
122122
if config.slide_prefix != config.tag_prefix:
123-
from libs.core.functions.page_tag_processor import PageTagType
123+
from contextifier.core.functions.page_tag_processor import PageTagType
124124
patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))
125125
return patterns
126126
else:
@@ -142,7 +142,7 @@ def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
142142
Regex pattern for sheet markers
143143
"""
144144
if page_tag_processor is not None:
145-
from libs.core.functions.page_tag_processor import PageTagType
145+
from contextifier.core.functions.page_tag_processor import PageTagType
146146
return page_tag_processor.get_pattern_string(PageTagType.SHEET)
147147
else:
148148
return r'\[Sheet:\s*([^\]]+)\]'
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import re
1212
from typing import List, Tuple
1313

14-
from libs.chunking.protected_regions import (
14+
from contextifier.chunking.protected_regions import (
1515
find_protected_regions, get_protected_region_positions,
1616
ensure_protected_region_integrity, split_large_chunk_with_protected_regions
1717
)
@@ -125,7 +125,7 @@ def chunk_by_pages(
125125
]
126126
config = page_tag_processor.config
127127
if config.slide_prefix != config.tag_prefix:
128-
from libs.core.functions.page_tag_processor import PageTagType
128+
from contextifier.core.functions.page_tag_processor import PageTagType
129129
page_marker_patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))
130130
else:
131131
page_marker_patterns = [

libs/chunking/protected_regions.py renamed to contextifier/chunking/protected_regions.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import re
1212
from typing import List, Tuple
1313

14-
from libs.chunking.constants import (
14+
from contextifier.chunking.constants import (
1515
HTML_TABLE_PATTERN, CHART_BLOCK_PATTERN, TEXTBOX_BLOCK_PATTERN,
1616
IMAGE_TAG_PATTERN, MARKDOWN_TABLE_PATTERN
1717
)
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,11 +12,11 @@
1212
import re
1313
from typing import Dict, List
1414

15-
from libs.chunking.constants import (
15+
from contextifier.chunking.constants import (
1616
ParsedTable, TableRow,
1717
TABLE_WRAPPER_OVERHEAD, CHUNK_INDEX_OVERHEAD
1818
)
19-
from libs.chunking.table_parser import (
19+
from contextifier.chunking.table_parser import (
2020
parse_html_table, extract_cell_spans_with_positions, has_complex_spans
2121
)
2222

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import re
1212
from typing import Dict, List, Optional, Tuple
1313

14-
from libs.chunking.constants import ParsedTable, TableRow
14+
from contextifier.chunking.constants import ParsedTable, TableRow
1515

1616
logger = logging.getLogger("document-processor")
1717

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515
from langchain_text_splitters import RecursiveCharacterTextSplitter
1616

17-
from libs.chunking.constants import LANGCHAIN_CODE_LANGUAGE_MAP
17+
from contextifier.chunking.constants import LANGCHAIN_CODE_LANGUAGE_MAP
1818

1919
logger = logging.getLogger("document-processor")
2020

0 commit comments

Comments
 (0)