-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathflow.py
More file actions
181 lines (157 loc) · 5.13 KB
/
flow.py
File metadata and controls
181 lines (157 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import json
import PyPDF2
from datetime import datetime
from crewai import Agent, Task, Crew, Process, LLM
def extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, one page per line group.

    Each page's extracted text is followed by a newline, matching the original
    concatenation order. PyPDF2's ``extract_text()`` returns ``None`` for pages
    with no extractable text layer (e.g. scanned images); those are treated as
    empty strings instead of raising ``TypeError``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # "".join is linear; repeated `text +=` in a loop is quadratic.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
# --- Pipeline inputs ------------------------------------------------------
# Read the paper text once; the extraction task embeds a slice of it below.
pdf_text = extract_pdf_text('test.pdf')

# Single Groq-hosted Llama 3.3 70B model shared by every agent in the crew.
llm = LLM(model="groq/llama-3.3-70b-versatile")

# More detailed agents
# Agent 1: pulls raw entity mentions out of the paper text.
extractor = Agent(
    role='Neuroscience Entity Extractor',
    goal='Extract ALL anatomical regions, cell types, processes from neuroscience papers',
    backstory='''You extract entities and provide:
- Exact entity text
- Entity label (ANATOMICAL_REGION, CELL_TYPE, PROCESS, BEHAVIOR, METHOD)
- Full sentence containing the entity
- Paper section (abstract, introduction, methods, results, discussion)''',
    llm=llm,
    verbose=True
)

# Agent 2: attaches ontology identifiers (UBERON/CL/GO/NCIT) to each entity.
ontology_agent = Agent(
    role='Ontology Mapper',
    goal='Map entities to precise ontology terms',
    backstory='''You map entities to ontologies with exact IDs:
- Anatomical regions → UBERON:XXXXXXX
- Cell types → CL:XXXXXXX
- Processes → GO:XXXXXXX
- Clinical terms → NCIT:CXXXXXX
Provide both ontology_id and ontology_label.''',
    llm=llm,
    verbose=True
)

# Agent 3: assigns each entity to one of Marr's three levels of analysis.
marr_agent = Agent(
    role='Marr Level Classifier',
    goal='Classify entities into Marr levels with detailed rationale',
    backstory='''You classify entities:
L1_computational: Goals, what is computed (e.g., object recognition, threat detection)
L2_algorithmic: How it's computed (e.g., predictive coding, place cells, Bayesian inference)
L3_implementational: Physical substrate (e.g., hippocampus, dopamine neurons, NMDA receptors)
Provide confidence score (0-1) and clear rationale.''',
    llm=llm,
    verbose=True
)

# Agent 4: scores the assembled extractions and adds review remarks.
qa_agent = Agent(
    role='Quality Assurance',
    goal='Score extractions and provide remarks',
    backstory='You assign judge_score (0-1) and detailed remarks for each entity extraction',
    llm=llm,
    verbose=True
)
# Detailed tasks with output structure
# Task 1: entity extraction over a 5 000-character slice of the paper
# (truncated to keep the prompt within the model's context budget).
extract_task = Task(
    description=f'''Extract ALL entities from this paper text:
{pdf_text[:5000]}
For EACH entity provide:
1. Entity text (exact as appears)
2. Label type
3. Full sentence containing it
4. Paper section
Output as JSON array of entities.''',
    expected_output='JSON array of extracted entities with all fields',
    agent=extractor
)

# Task 2: ontology mapping; `context=[extract_task]` feeds Task 1's output in.
ontology_task = Task(
    description='''For each entity, find matching ontology term.
Output JSON with added fields:
- ontology_id (e.g., "UBERON:0000955")
- ontology_label (e.g., "brainstem")
Use exact ontology formats.''',
    expected_output='Entities with ontology mappings',
    agent=ontology_agent,
    context=[extract_task]
)

# Task 3: Marr-level classification, chained on the ontology-mapped entities.
marr_task = Task(
    description='''Classify each entity by Marr level and add:
- marr_level: "L1", "L2", or "L3"
- marr_rationale: detailed explanation
Group entities into:
{
"L1_computational": [...],
"L2_algorithmic": [...],
"L3_implementational": [...]
}''',
    expected_output='Entities grouped by Marr level',
    agent=marr_agent,
    context=[ontology_task]
)

# Task 4: QA scoring; the description pins the exact JSON-LD-ish shape the
# final output parser below expects.
qa_task = Task(
    description='''For each entity add provenance:
{
"judge_score": [score between 0-1],
"remarks": ["detailed comment 1", "comment 2", ...]
}
Output final JSON matching this structure exactly:
{
"L1_computational": [entities...],
"L2_algorithmic": [entities...],
"L3_implementational": [
{
"@id": "entity:name",
"entity": "text",
"label": "TYPE",
"marr_level": "L3",
"ontology_id": "UBERON:XXXXX",
"ontology_label": "label",
"marr_rationale": "reason",
"context_sentences": ["sentence"],
"paper_location": ["section"],
"provenance": {
"judge_score": [1.0],
"remarks": ["remark"]
}
}
]
}''',
    expected_output='Complete JSON-LD formatted entities',
    agent=qa_agent,
    context=[marr_task],
    # NOTE(review): crewai's `output_json` normally takes a Pydantic model
    # class, not `dict` — confirm this is accepted by the installed version.
    output_json=dict
)
# Run the four tasks sequentially: extract → map ontologies → classify → QA.
crew = Crew(
    agents=[extractor, ontology_agent, marr_agent, qa_agent],
    tasks=[extract_task, ontology_task, marr_task, qa_task],
    process=Process.sequential,
    verbose=True
)

# `result` holds the final (QA) task's output; `.raw` is read below.
result = crew.kickoff()
# Parse and format output
# kickoff() may return the final task's output as a JSON string or as an
# already-parsed object depending on the crewai version, so handle both.
try:
    entities_data = json.loads(result.raw) if isinstance(result.raw, str) else result.raw
except (json.JSONDecodeError, TypeError, AttributeError):
    # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit. On a malformed LLM response, fall back to an empty
    # grouping rather than crashing the pipeline.
    entities_data = {"L1_computational": [], "L2_algorithmic": [], "L3_implementational": []}

# JSON-LD envelope: @context maps the compact prefixes used in entity IDs
# to their full ontology IRIs.
output = {
    "@context": {
        "@version": 1.1,
        "marr": "http://bbqs-project.org/ontology/marr#",
        "uberon": "http://purl.obolibrary.org/obo/UBERON_",
        "cl": "http://purl.obolibrary.org/obo/CL_",
        "go": "http://purl.obolibrary.org/obo/GO_",
        "ncit": "http://purl.obolibrary.org/obo/NCIT_",
        "prov": "http://www.w3.org/ns/prov#"
    },
    "@type": "marr:NeuroscienceExtraction",
    # NOTE(review): paper_id/paper_title are placeholders — fill from the PDF
    # metadata or the extraction output before publishing.
    "marr:paper_id": "DOI:10.xxxx/xxxxx",
    "marr:paper_title": "Extracted Paper Title",
    "marr:extraction_date": datetime.now().isoformat(),
    "marr:schema_version": "2.0",
    "marr:entities_by_level": entities_data
}

# Explicit utf-8 + ensure_ascii=False keeps non-ASCII ontology labels
# human-readable in the output file instead of \uXXXX escapes.
with open('test_entities.jsonld', 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)
print("✓ Saved to test_entities.jsonld")