|
| 1 | + |
| 2 | +from dataclasses import dataclass, field |
| 3 | +from guidance import models, gen, block |
| 4 | +from typing import ClassVar, Optional |
| 5 | +import pydantic |
| 6 | +from pydantic import create_model |
| 7 | +from functools import cached_property |
| 8 | +import json |
| 9 | +import os |
| 10 | + |
| 11 | +import guidance |
| 12 | +from guidance import one_or_more, select, zero_or_more |
| 13 | +from simple_parsing import Serializable |
| 14 | +import numpy as np |
| 15 | + |
| 16 | +from mmz.agents.with_guidance import GuidanceLlamaCppConfig |
| 17 | +from mmz.agents import tools as mzt |
| 18 | + |
| 19 | + |
# stateless=True indicates this function does not depend on LLM generations
@guidance(stateless=True)
def reference_selection(lm, n_total_refs, selection_name='selected_ixes'):
    """Grammar: one or more picks from the digit strings '0'..str(n_total_refs - 1).

    Each pick is captured under *selection_name* on the model state.
    """
    digit_choices = list(map(str, range(n_total_refs)))
    return lm + one_or_more(select(digit_choices, name=selection_name))
| 25 | + |
| 26 | + |
| 27 | +@guidance(stateless=True) |
| 28 | +def operator(lm): |
| 29 | + return lm + select(['+' , '*', '**', '/', '-']) |
| 30 | + |
| 31 | + |
def get_summary_relevance_prompt(query: str, summaries: list[dict]) -> str:
    """Build a prompt asking the LLM to rank search-result summaries by relevance.

    Args:
        query: the user's search query, embedded verbatim in the prompt.
        summaries: dicts each providing 'title' and 'summary' keys.

    Returns:
        The full prompt text.  The model is instructed to respond with ONLY a
        JSON array of 0-based reference indices, most relevant first.
    """
    from datetime import datetime

    # Anchor "timeliness" to today's date so the model can judge recency.
    current_time = datetime.now().strftime("%Y-%m-%d")
    # NOTE: removed dead code — a shuffled index list was computed and printed
    # but never used (its example line in the prompt was commented out).
    results_json = json.dumps(
        [{'reference index': ii,
          'title': s['title'],
          'summary': s['summary']}
         for ii, s in enumerate(summaries)],
        indent=2)
    prompt = f"""Analyze these search results and provide a ranked list of the most relevant ones.

IMPORTANT: Evaluate and rank based on these criteria (in order of importance):
1. Timeliness - current/recent information as of {current_time}
2. Direct relevance to query: "{query}"
3. Source reliability (prefer official sources, established websites)
4. Factual accuracy (cross-reference major claims)

Search results to evaluate:
 {results_json}

Return ONLY a JSON array of the 0-based reference index, ranked from most to least relevant.
Include ONLY indices that meet ALL criteria, with the most relevant first.
You should list all {len(summaries)} indices in your response.
You should not output any number larger than {len(summaries) - 1}
Respond with ONLY the JSON array, no other text."""
    return prompt
| 61 | + |
def get_summary_relevance_scalar_prompt(query: str, summary: dict) -> str:
    """Build a prompt asking the LLM to score one summary's relevance (0-100).

    Args:
        query: the user's search query, embedded verbatim in the prompt.
        summary: a dict providing 'title' and 'summary' keys.

    Returns:
        The full prompt text.  The model is told to reply with a single number:
        100 = most relevant / likely answers the query, 0 = not relevant.
    """
    from datetime import datetime

    # Anchor "timeliness" to today's date so the model can judge recency.
    current_time = datetime.now().strftime("%Y-%m-%d")
    result_json = json.dumps({'title': summary['title'],
                              'summary': summary['summary']},
                             indent=2)
    prompt = f"""Analyze these search results and provide a number
between 0 and 100 according to its relevance to the users query,
100 being the most relevant and likely answers the query
0 being the least relevant and does not answer the query

IMPORTANT: Evaluate and estimate relevance based on these criteria (in order of importance):
1. Timeliness - current/recent information as of {current_time}
2. Direct relevance to query: "{query}"

Search results to evaluate:
 {result_json}

Respond only with a number within 0 and 100 and nothing else: """
    return prompt
| 82 | + |
@guidance
def relevance_by_regex(llm, query, summaries):
    """Append the ranking prompt and constrain the reply to digits inside '[ ... ]'."""
    prompt = get_summary_relevance_prompt(query, summaries=summaries)
    return llm + prompt + '[ ' + gen(regex=r'\d+') + ']'
| 88 | + |
| 89 | + |
@guidance
def relevance_by_selection(llm, query, summaries, selection_name='selected_ixes'):
    """Append the ranking prompt and force the model to pick reference indices.

    Picked digits are captured under *selection_name* on the returned state.
    """
    prompt = get_summary_relevance_prompt(query, summaries=summaries)
    index_grammar = reference_selection(n_total_refs=len(summaries),
                                        selection_name=selection_name)
    return llm + prompt + '[ ' + index_grammar + ']'
| 99 | + |
| 100 | + |
def get_list_of_int_grammar(name="integers"):
    """Return a guidance JSON grammar for an object with one list-of-int field.

    The generated JSON has the shape {"<name>": [<int>, ...]} and the raw text
    is captured on the model state under *name*.
    """
    # pydantic's create_model expects (annotation, default) field tuples; a
    # bare type is misread as a *default value* (pydantic v1) or rejected.
    # `...` (Ellipsis) marks the field as required.
    schema = create_model(f"list_of_{name}", **{name: (list[int], ...)})
    return guidance.json(name=name, schema=schema)
| 108 | + |
| 109 | + |
def get_list_additional_topics_prompt(query: str) -> str:
    """Build a prompt asking the LLM for a JSON list of topics related to *query*."""
    from datetime import datetime

    # Local, naive timestamp — gives the model a sense of "now".
    t = str(datetime.now())
    # NOTE: the original concatenation ran the time sentence straight into the
    # instruction ("...{t}Given the users...") — separate them with a newline.
    prompt = (f"The local time is {t}\n"
              "Given the users query, produce a JSON list of other topics related to their query.\n"
              f"Here is their query: {query}\n"
              "Provide a list of JSON strings of related topics: ")
    return prompt
| 120 | + |
| 121 | + |
def get_list_of_str_grammar(name="strings"):
    """Return a guidance JSON grammar for an object with one list-of-str field.

    The generated JSON has the shape {"<name>": ["...", ...]} and the raw text
    is captured on the model state under *name*.
    """
    # (annotation, ...) marks the field as required; a bare type would be
    # misread by pydantic's create_model as a default value.
    schema = create_model(f"list_of_{name}", **{name: (list[str], ...)})
    return guidance.json(name=name, schema=schema)
| 126 | + |
| 127 | + |
def get_q_and_a_grammar(name='answer'):
    """Return a guidance JSON grammar for {"<name>": <str>, "confidence": <int>}."""
    # (annotation, ...) marks each field as required; bare types would be
    # misread by pydantic's create_model as default values.
    schema = create_model(f"{name}", **{name: (str, ...), 'confidence': (int, ...)})
    return guidance.json(name=name, schema=schema)
| 132 | + |
| 133 | + |
@guidance
def select_next(choices):
    # NOTE(review): unimplemented stub — body is `pass`, so it returns None.
    # Guidance-decorated functions conventionally take the model state `lm`
    # as their first argument (see the other @guidance functions in this
    # file); confirm the intended signature before wiring this in.
    pass
| 137 | + |
| 138 | + |
@guidance
def relevance_by_json_int_list(llm, query, summaries, name='selected_ixes'):
    """Append the ranking prompt and constrain the reply to a JSON int-list object.

    The raw JSON text is captured on the model state under *name*.
    """
    prompt = get_summary_relevance_prompt(query, summaries=summaries)
    int_list_grammar = get_list_of_int_grammar(name=name)
    return llm + prompt + int_list_grammar
| 147 | + |
| 148 | + |
@guidance
def relevance_scalar(llm, query, summary, name='relevance_magnitude'):
    """Append the 0-100 relevance prompt and constrain the reply to JSON.

    The reply is captured under *name* as JSON text: {"<name>": <int>}.
    """
    # (annotation, ...) marks the field as required; a bare `int` would be
    # misread by pydantic's create_model as a default value.  The local
    # create_model import was redundant — it is imported at module level.
    schema = create_model(f"scalar_{name}", **{name: (int, ...)})
    prompt = get_summary_relevance_scalar_prompt(query, summary=summary)
    return llm + prompt + guidance.json(name=name, schema=schema)
| 155 | + |
| 156 | + |
@dataclass
class GuidanceGuide(Serializable):
    """High-level driver that runs this module's guidance grammars against an LLM.

    Attributes:
        model_preset: named preset used to build a model config when none is given.
        model_config: explicit llama.cpp config; built from the preset if None.
    """
    model_preset: Optional[str] = 'med'

    model_config: Optional[GuidanceLlamaCppConfig] = None

    def __post_init__(self):
        # Resolve the preset only when no explicit config was supplied.
        if self.model_config is None:
            self.model_config = GuidanceLlamaCppConfig.get_preset(self.model_preset)

    @property
    def model(self) -> models.Model:
        """The underlying guidance model object."""
        return self.model_config.model

    def get_relevant_ixes_from_summary(self, user_q: str,
                                       summaries: list[dict],
                                       relevance_grammar: callable = relevance_by_selection,
                                       as_list: bool = True):
        """Select/rank reference indices of *summaries* relevant to *user_q*.

        Args:
            user_q: the user's query.
            summaries: dicts each providing 'title' and 'summary' keys.
            relevance_grammar: a @guidance function that captures its result
                under the key 'selected_ixes'.
            as_list: when True, parse the captured text into list[int].

        Returns:
            The raw captured string, or a list of 0-based indices into
            *summaries* when as_list is True.
        """
        res = self.model + relevance_grammar(user_q, summaries=summaries)
        res = res['selected_ixes']
        if as_list:
            if relevance_grammar == relevance_by_selection:
                # The selection grammar captures bare digits; wrap them in
                # brackets so e.g. "0" parses as [0].
                # NOTE(review): concatenated picks like "01" would not parse
                # as two separate indices — confirm capture behavior upstream.
                res = json.loads(f"[{res}]")
            else:
                # `res` is already the captured JSON text of the form
                # '{"selected_ixes": [...]}'.  BUGFIX: the original indexed
                # the string with a str key (TypeError); parse it instead.
                res = json.loads(res)['selected_ixes']
        return res

    def filter_to_relevant_summeries(self, user_q: str,
                                     summaries: list[dict],
                                     relevance_grammar: callable = relevance_by_selection) -> list[dict]:
        """Return only the summaries judged relevant, most relevant first.

        (Method-name spelling kept as-is for backward compatibility.)
        """
        ixes_to_keep = self.get_relevant_ixes_from_summary(
            user_q=user_q,
            summaries=summaries,
            relevance_grammar=relevance_grammar
        )
        return [summaries[i] for i in ixes_to_keep]

    def get_relevance_score(self, user_q: str, summary: dict) -> int:
        """Score a single summary's relevance to *user_q* on a 0-100 scale."""
        res = self.model + relevance_scalar(query=user_q,
                                            summary=summary)
        # Captured text is '{"relevance_magnitude": <int>}'.
        res = json.loads(res['relevance_magnitude'])['relevance_magnitude']
        return int(res)

    def expand_topic_grammar(self, user_q: str):
        """Model state after prompting for related topics with a JSON-list grammar."""
        return (self.model
                + get_list_additional_topics_prompt(query=user_q)
                + get_list_of_str_grammar(name='topics'))

    def expand_topics(self, user_q: str,
                      as_list: bool = True,
                      deduplicate_list: bool = True
                      ) -> str | list[str]:
        """Return topics related to *user_q* as raw JSON text or a list of strings."""
        # First ['topics'] access pulls the raw captured JSON text from guidance.
        res = self.expand_topic_grammar(user_q=user_q)['topics']
        if not as_list:
            return res
        # Second ['topics'] access reads the key out of the parsed JSON object.
        topic_l = json.loads(res)['topics']
        if deduplicate_list:
            # NOTE: set() de-duplication does not preserve the model's ordering.
            topic_l = list(set(topic_l))
        return topic_l

    def answer_query(self, user_q: str, content):
        """NOTE(review): unimplemented stub — returns None."""
        return
| 223 | + |
| 224 | + |
0 commit comments