214 changes: 214 additions & 0 deletions common/utils/tool.py
@@ -0,0 +1,214 @@
import os
import re
import time
import ast
from common.core.prompt import get_related_keyword_prompt, paper_compression_prompt, extract_entity_prompt, extract_tec_entities_prompt, review_mechanism_prompt
from common.utils.llm_api import call_with_deepseek, call_with_deepseek_jsonout, call_with_qwenmax
from common.utils.arxiv_api import search_paper
from common.utils.scholar_download import download_all_pdfs
from common.utils.pdf_to_md import pdf2md_mineruapi
from common.utils.wiki_search import get_description, search
from common.core.config import OUTPUT_PATH, get_graph

def SearchKeyWordScore(Keywords):
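    """Rank keyword dicts by a composite of LLM importance and corpus frequency.

    Each item needs 'entity' and 'importance_score' keys; a usage 'count' is
    looked up in the Neo4j Words nodes (0 when the graph is unavailable),
    min-max normalized, and blended with the importance score (0.4/0.6).
    """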
for index, keyword in enumerate(Keywords):
entity = keyword['entity']
query = """
MATCH (n:Words)
WHERE n.other CONTAINS $entity OR n.name = $entity
RETURN n.count,n
ORDER BY n.count DESC
LIMIT 1
"""
nodes = []
g = get_graph()
if g:
try:
nodes = g.run(query, entity=entity).data()
except Exception:
nodes = []
if len(nodes) != 0:
Keywords[index]['count'] = nodes[0]['n.count']
else:
Keywords[index]['count'] = 0
min_count = min(item['count'] for item in Keywords) if Keywords else 0
max_count = max(item['count'] for item in Keywords) if Keywords else 1
weight_importance = 0.4
weight_count = 0.6
for item in Keywords:
if max_count == min_count:
normalized_count = 0.5
else:
normalized_count = (item['count'] - min_count) / (max_count - min_count)
composite_score = (item['importance_score'] * weight_importance) + (normalized_count * weight_count)
item['composite_score'] = composite_score
sorted_data = sorted(Keywords, key=lambda x: x['composite_score'], reverse=True)
return sorted_data

def get_related_keyword(Keyword):
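    """Ask DeepSeek for keywords related to `Keyword`; returns the parsed
    list, or [] when the reply is not a valid Python literal."""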
user_prompt = get_related_keyword_prompt(Keyword=Keyword)
result = call_with_deepseek(system_prompt="You are a helpful assistant.", question=user_prompt)
    try:
        return ast.literal_eval(result)
    except (ValueError, SyntaxError):
        # The model is prompted for a Python-literal list but may not comply.
        return []

def remove_number_prefix(paragraph):
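    """Strip leading 'N. ' numbering from each line of a paragraph."""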
pattern = r'^\d+\. |(?<=\n)\d+\. '
modified_paragraph = re.sub(pattern, '', paragraph, flags=re.MULTILINE)
return modified_paragraph

def read_markdown_file(file_path):
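    """Return the file's text, or '' if it cannot be read."""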
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
return content
except Exception:
return ""

def extract_hypothesis(file, split_section="Hypothesis"):
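    """Collect the text following each '<split_section> N:' heading in a
    markdown file, e.g. 'Hypothesis 1: ...'."""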
text = read_markdown_file(file)
    # re.escape guards against regex metacharacters in the section name
    pattern = re.compile(fr"{re.escape(split_section)} \d+:(.+?)\n", re.DOTALL)
matches = pattern.findall(text)
hypotheses = [match.strip() for match in matches]
return hypotheses

def paper_compression(doi, title, topic, user_id, task):
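    """Download a paper's PDF, convert it to markdown via the MinerU API
    (retrying once after 10 s), strip the References section, and compress
    the body with Qwen-Max. The summary is also written under
    <OUTPUT_PATH>/<user_id>/<task_id>/<topic>/Paper/compression/. Returns the
    string 'None' when conversion fails twice.
    """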
paper_pdf_path = download_all_pdfs(dois=doi, title=title, topic=topic, user_id=user_id, task=task)
task_id = getattr(task, 'request', {}).get('id', 'default_task_id') if task else 'default_task_id'
file_path_prefix = fr"{OUTPUT_PATH}/{user_id}/{task_id}/{topic}/Paper"
directory = fr"{file_path_prefix}/compression"
batch_id = pdf2md_mineruapi(file_path=paper_pdf_path, topic=topic, user_id=user_id, task=task)
if batch_id == 0:
time.sleep(10)
batch_id = pdf2md_mineruapi(file_path=paper_pdf_path, topic=topic, user_id=user_id, task=task)
    pattern = r'#*\s*References.*'
if batch_id != 0:
paper_content = read_markdown_file(file_path=fr"{file_path_prefix}/markdown/{batch_id}.md")
paper_content = re.sub(pattern, '', paper_content, flags=re.DOTALL | re.IGNORECASE)
else:
return 'None'
system_prompt = paper_compression_prompt()
compression_result = call_with_qwenmax(question=f"The content is '''{paper_content}'''", system_prompt=system_prompt)
if not os.path.exists(directory):
os.makedirs(directory)
with open(fr"{directory}/{batch_id}.md", 'w', encoding='utf-8') as f:
f.write(compression_result)
return compression_result

def search_releated_paper(topic, max_paper_num=5, compression=True, user_id="", task=None):
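    """Search for papers on `topic`, optionally compress each one, and collect
    keyword entities from paper metadata (or DeepSeek extraction as fallback).
    Returns (keyword_count, related_papers, keyword_description_str), where
    each keyword line is enriched with a Wikipedia description when available.
    """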
keynum = 0
relatedPaper = []
Entities = []
papers = search_paper(Keywords=[topic], Limit=max_paper_num)
for paper in papers:
if compression:
try:
compression_result = paper_compression(doi=paper["doi"], title=paper["title"], topic=topic, user_id=user_id, task=task)
except Exception:
compression_result = "None"
else:
compression_result = "None"
try:
relatedPaper.append({"title": paper["title"], "abstract": paper["abstract"], "compression_result": compression_result})
except Exception:
continue
paper_keywords = paper.get("keyword")
if paper_keywords:
try:
for keyword in paper_keywords:
Entities.append(keyword)
except Exception:
pass
else:
try:
jr = call_with_deepseek_jsonout(system_prompt=extract_entity_prompt(), question=paper.get("abstract", ""))
extracted = jr.get("keywords", []) if jr else []
for kw in extracted:
Entities.append(kw)
except Exception:
pass
try:
json_result = call_with_deepseek_jsonout(system_prompt=extract_entity_prompt(), question=f"""The content is: {Entities}.""")
Keywords = json_result.get('keywords', []) if json_result else []
except Exception:
Keywords = []
keyword_str = ""
for keyword in Keywords:
keynum += 1
temp = get_description(search(query=keyword))
if not temp:
keyword_str += f"{keyword};\n"
else:
keyword_str += f"{keyword}:{temp[0]};\n"
return keynum, relatedPaper, keyword_str

def extract_message(file, split_section):
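    """Return (full_text, section_body) for the '### ...' heading containing
    `split_section`; raises ValueError when the section is missing."""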
text = read_markdown_file(file)
if not split_section:
raise ValueError("split_section must be non-empty")
    match = re.search(fr'### \S*{re.escape(split_section)}\S*(.*?)(?=###|---|\Z)', text, re.DOTALL)
if not match:
raise ValueError(f"Section '{split_section}' not found in file: {file}")
problem_statement = match.group(1).strip()
return text, problem_statement

def extract_technical_entities(file, split_section):
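    """Extract technical keywords from a draft section via DeepSeek and rank
    them with SearchKeyWordScore. Returns (sorted_entities, full_text)."""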
text, problem_statement = extract_message(file, split_section)
system_prompt = extract_tec_entities_prompt()
    json_result = call_with_deepseek_jsonout(system_prompt=system_prompt, question=f'The content is: {problem_statement}')
    Keywords = json_result.get('keywords', []) if json_result else []
sorted_entities = SearchKeyWordScore(Keywords)
return sorted_entities, text

def extract_message_review(file, split_section):
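    """Find the heading containing `split_section` and, for the two known
    review sections, have DeepSeek re-emit the body as structured JSON;
    otherwise the raw section body itself is sent as the question.
    Returns (full_text, json_result).
    """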
text = read_markdown_file(file)
if not split_section:
raise ValueError("split_section must be non-empty")
    match = re.search(fr'(#.*{re.escape(split_section)}\**:*)(.*?)(?=#|\Z|---)', text, re.DOTALL)
if not match:
raise ValueError(f"Section '{split_section}' not found in file: {file}")
problem_statement = match.group(2).strip()
if split_section == "Iterative Optimization Search Keywords":
question = f"""Based on the content provided below, extract the next optimization search keywords. Return the
result only in the following JSON format. Do not add any explanations or irrelevant information. JSON output
format: {{ "optimization_keyword": "xxx", ... }}

Content to extract:
'''
#{split_section}\n{problem_statement}
'''"""
elif split_section == "Next Steps for Optimization":
question = f"""Based on the content provided below, extract the next optimization goal. Return the result
only in the following JSON format. Do not add any explanations or irrelevant information. JSON output format:
{{ "optimization_goal": "xxx", ... }}

Content to extract:
'''
#{split_section}\n{problem_statement}
'''"""
else:
question = problem_statement
result = call_with_deepseek_jsonout(question=question, system_prompt="")
return text, result

def review_mechanism(topic, draft="", user_id="", task=None):
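    """Review an idea draft with Qwen-Max, save the review under
    <OUTPUT_PATH>/<user_id>/<task_id>/<topic>/Review/, then return the
    'Iterative Optimization Search Keywords' it produced as a list of
    {'keyword': ...} dicts for the next search round.
    """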
system_prompt = review_mechanism_prompt()
user_prompt = f"""# Idea Draft\n{draft}"""
result = call_with_qwenmax(system_prompt=system_prompt, question=user_prompt)
task_id = getattr(task, 'request', {}).get('id', 'default_task_id') if task else 'default_task_id'
file_path_prefix = fr"{OUTPUT_PATH}/{user_id}/{task_id}/{topic}/Review"
os.makedirs(file_path_prefix, exist_ok=True)
with open(fr"{file_path_prefix}/{topic}_review.md", 'w', encoding='utf-8') as f:
f.write(result)
text, optimize_messages = extract_message_review(file=fr"{file_path_prefix}/{topic}_review.md", split_section="Iterative Optimization Search Keywords")
keywords = []
for optimize_message in optimize_messages.values():
keywords.append({'keyword': optimize_message})
return keywords

def extract_message_review_moa(file, split_section):
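    """Like extract_message_review, but returns the raw section body split
    into lines instead of re-querying the LLM."""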
text = read_markdown_file(file)
if not split_section:
raise ValueError("split_section must be non-empty")
    match = re.search(fr'(#.*{re.escape(split_section)}\**:*)(.*?)(?=#|\Z|---)', text, re.DOTALL)
if not match:
raise ValueError(f"Section '{split_section}' not found in file: {file}")
problem_statement = match.group(2).strip().split('\n')
return text, problem_statement
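
A minimal usage sketch for the two dependency-free helpers in this module (the sample text and file name are illustrative, not part of the PR):

from common.utils.tool import remove_number_prefix, extract_hypothesis

sample = "1. First finding\n2. Second finding"
print(remove_number_prefix(sample))  # -> "First finding\nSecond finding"

# extract_hypothesis scans a markdown file for headings like "Hypothesis 1: ..."
# and returns the text after each one; since read_markdown_file swallows I/O
# errors, a missing file simply yields [].
print(extract_hypothesis("idea_draft.md", split_section="Hypothesis"))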
2 changes: 1 addition & 1 deletion skills/academic_workflow/scripts/main.py
@@ -1,5 +1,5 @@
import uuid
from scispark_ms_skills.common.workflow import Initial_Idea, Technical_Optimization, MoA_Based_Optimization, Human_AI_Collaboration
from common.workflow import Initial_Idea, Technical_Optimization, MoA_Based_Optimization, Human_AI_Collaboration

class SimpleTask:
def __init__(self, task_id=None):