-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
95 lines (79 loc) · 2.46 KB
/
config.py
File metadata and controls
95 lines (79 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Configuration management for NLP Paper Analyzer.
"""
import os
import torch
from pathlib import Path
class Config:
    """Settings hub for the NLP Paper Analyzer.

    Every setting is a class attribute, so no instance is required:
    filesystem locations, the torch device string, and the constants
    grouped by pipeline phase below.
    """

    # --- Filesystem layout -------------------------------------------------
    # The project root is two directory levels above this file.
    PROJECT_ROOT = Path(__file__).parent.parent
    DATA_DIR = PROJECT_ROOT / "data"
    MODELS_DIR = PROJECT_ROOT / "models"
    OUTPUTS_DIR = PROJECT_ROOT / "outputs"

    # --- Device ------------------------------------------------------------
    # Chosen once, when the class body runs: "cuda" if torch reports CUDA
    # as available, otherwise "cpu".
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Phase 1: data processing ------------------------------------------
    KAGGLE_DATASET = "jonauskis/asap-review"

    # --- Phase 2: embeddings -----------------------------------------------
    TFIDF_MAX_FEATURES = 500
    W2V_VECTOR_SIZE = 100
    W2V_WINDOW = 5
    W2V_MIN_COUNT = 5
    BERT_MODEL = "all-MiniLM-L6-v2"
    GLOVE_MODEL = "glove-wiki-gigaword-100"
    FASTTEXT_VECTOR_SIZE = 100

    # --- Phase 3: section classification -----------------------------------
    MAX_WORDS = 20000
    MAX_LEN = 300
    EMBEDDING_DIM = 128
    BATCH_SIZE = 32
    EPOCHS = 10
    VALIDATION_SPLIT = 0.2
    TARGET_BALANCE_SIZE = 3000

    # Section names an academic paper is expected to contain.
    EXPECTED_SECTIONS = [
        "Introduction",
        "Related Work",
        "Methodology",
        "Experiments",
        "Conclusion",
        "Appendix",
    ]

    # --- Phase 4: OCR ------------------------------------------------------
    NOUGAT_MODEL = "facebook/nougat-base"
    OCR_MAX_TOKENS = 4000

    # --- Phase 5: grammar correction ---------------------------------------
    GRAMMAR_MODEL = "t5-base"
    GRAMMAR_MAX_INPUT_LEN = 256
    GRAMMAR_MAX_TARGET_LEN = 256
    GRAMMAR_LEARNING_RATE = 5e-5
    GRAMMAR_MODEL_PATH = MODELS_DIR.joinpath("t5-grammar", "best_model")

    # --- Phase 6: fact checking --------------------------------------------
    FACT_MODEL = "t5-small"
    FACT_MAX_LEN = 512
    FACT_LEARNING_RATE = 5e-5
    FACT_MODEL_PATH = MODELS_DIR.joinpath("t5_fever", "best_model")

    # --- Phase 7: pipeline -------------------------------------------------
    PIPELINE_MAX_GRAMMAR_CHARS = 500

    # --- Phase 8: scoring --------------------------------------------------
    # Every scoring component currently carries the same weight.
    SCORING_WEIGHTS = {
        "structure": 1.0,
        "section_order": 1.0,
        "classification_confidence": 1.0,
        "grammar_quality": 1.0,
        "consistency": 1.0,
    }

    @classmethod
    def ensure_dirs(cls):
        """Make sure the data, models and outputs directories exist.

        Missing parent directories are created as well, and directories
        that are already present are left untouched.
        """
        for folder in (cls.DATA_DIR, cls.MODELS_DIR, cls.OUTPUTS_DIR):
            folder.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get_model_path(cls, phase: str) -> Path:
        """Return the model location registered for ``phase``.

        ``'grammar'`` and ``'fact_check'`` map to their dedicated
        ``*_MODEL_PATH`` attributes; any other phase name resolves to a
        path of that name directly under ``MODELS_DIR``.
        """
        # Computed up front so an unsupported ``phase`` value fails on the
        # path join regardless of which branch is taken below.
        fallback = cls.MODELS_DIR / phase
        dedicated = {
            "grammar": cls.GRAMMAR_MODEL_PATH,
            "fact_check": cls.FACT_MODEL_PATH,
        }
        if phase in dedicated:
            return dedicated[phase]
        return fallback