Weekly-reports-bot/smart_bot.py at main · Human-Augment-Analytics/Weekly-reports-bot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# smart_bot.py
import os
import re
import pdfplumber

# =========================
# Configuration
# =========================
REPORTS_DIR = os.path.join(os.path.dirname(__file__), "Weekly Progress Reports")

HEADERS = {
    "completed": [
        "What did you do this week?",
        "What have you done this week?"
    ],
    "next": [
        "What are you going to do next week?",
        "Next week"
    ],
    "blockers": [
        "Blockers",
        "Blockers, things you want to flag, problems, etc."
    ]
}

STOP_HEADERS = [
    "Abstracts",
    "Abstract",
    "References",
    "Reference Article",
    "Summary",
    "What did you do and prove it",
    "Proof of Progress",
    "Visualization Validation"
]

ALL_HEADERS = sum(HEADERS.values(), [])

SECTION_ICONS = {
    "Completed": "✅",
    "In Progress": "🔄",
    "Blockers": "⚠️"
}

# =========================
# PDF Text Extraction
# =========================
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            pages = [p.extract_text() for p in pdf.pages if p.extract_text()]
            return "\n".join(pages)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# =========================
# Section Extraction
# =========================
def extract_section(text, header_keywords):
    for header in header_keywords:
        stop_headers = [h for h in (ALL_HEADERS + STOP_HEADERS) if h.lower() != header.lower()]
        stop_pattern = "|".join(r"^\s*" + re.escape(h) for h in stop_headers)
        pattern = r"^\s*" + re.escape(header) + r"\s*(.*?)" + r"(?=(" + stop_pattern + r")|\Z)"
        match = re.search(pattern, text, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
        if not match:
            continue

        content = match.group(1).strip()
        lines = []

        for line in content.split("\n"):
            line = line.strip()
            # Skip empty or leftover header lines / email / week info
            if not line or re.search(r"things you want to flag|Week \d+|@\w+", line, re.I):
                continue
            # Remove bullets
            line = re.sub(r"^[•\-●]\s*", "", line)
            # Collapse internal spaces
            line = re.sub(r"\s+", " ", line)
            lines.append(line)

        # Smart merging
        merged = ""
        for l in lines:
            if merged:
                if merged[-1] in ".!?":
                    merged += " " + l
                else:
                    merged += ". " + l
            else:
                merged = l

        merged = merged.strip()
        if merged and merged[-1] not in ".!?":
            merged += "."

        return merged or "None"

    return "None"

# =========================
# Report Parsing
# =========================
def parse_report(file_path):
    text = extract_text_from_pdf(file_path)
    completed = extract_section(text, HEADERS["completed"])
    next_week = extract_section(text, HEADERS["next"])
    blockers = extract_section(text, HEADERS["blockers"])
    return completed, next_week, blockers

# =========================
# Latest Week Selection
# =========================
def get_latest_week_files():
    latest_week = -1
    week_files = []
    for file in os.listdir(REPORTS_DIR):
        match = re.search(r"Week(\d+)", file)
        if not match:
            continue
        week_num = int(match.group(1))
        if week_num > latest_week:
            latest_week = week_num
            week_files = [file]
        elif week_num == latest_week:
            week_files.append(file)
    return latest_week, week_files

# =========================
# Build Smart Summary
# =========================
def build_summary(latest_week, files):
    summary = f"*Week {latest_week} Progress Summary*\n\n"
    sections = {"Completed": [], "In Progress": [], "Blockers": []}

    for file in files:
        name_match = re.search(r"Week\d+-(.*)\.pdf", file)
        name = name_match.group(1) if name_match else file
        completed, next_week, blockers = parse_report(os.path.join(REPORTS_DIR, file))

        # Full updates for Completed and In Progress
        sections["Completed"].append(f"- *{name}*: {completed}")
        sections["In Progress"].append(f"- *{name}*: {next_week}")

        # Only include meaningful blockers
        blockers_clean = blockers.strip().lower()
        if not re.fullmatch(r"(none|no|na|n/a|no blockers)?\.?", blockers_clean):
            sections["Blockers"].append(f"- *{name}*: {blockers}")

    for section, items in sections.items():
        summary += f"{SECTION_ICONS.get(section,'')} {section}:\n"
        if items:
            summary += "\n".join(items)
        else:
            # Optional: indicate no blockers at all
            if section == "Blockers":
                summary += "• None"
            else:
                summary += "• None reported"
        summary += "\n\n"

    return summary.strip()