
Commit ff2c9fa

Add scraper for knowledge base solutions data
1 parent 7a81ffc commit ff2c9fa

5 files changed: 217 additions & 0 deletions

data_scraper/common/constants.py (4 additions & 0 deletions)
@@ -3,6 +3,9 @@
 JIRA_COLLECTION_NAME = "rca-knowledge-base"
 OSP_DOCS_COLLECTION_NAME = "rca-osp-docs-knowledge-base"
 ERRATA_COLLECTION_NAME = "rca-errata"
+SOLUTIONS_COLLECTION_NAME = "rca-solutions"
+SOLUTIONS_PRODUCT_NAME = "OpenStack"
+SOLUTIONS_MAX_RESULTS = 9999
 DEFAULT_EMBEDDING_MODEL = "BAAI/bge-m3"
 DEFAULT_JIRA_URL = "https://issues.redhat.com"
 DEFAULT_JIRA_PROJECTS = {
@@ -15,3 +18,4 @@
 DEFAULT_DATE_CUTOFF = "2000-01-01T00:00:00Z"
 DEFAULT_NUM_SCRAPER_PROCESSES=10
 DEFAULT_ERRATA_PUBLIC_URL="https://access.redhat.com/errata"
+DEFAULT_SOLUTIONS_PUBLIC_URL="https://access.redhat.com"
data_scraper/core/solutions_scraper.py (new file, 96 additions & 0 deletions)
"""Code for scraping Solutions data"""
import logging
import multiprocessing as mp
import subprocess
import sys

from typing import List, Dict, TypedDict, Any
from tqdm import tqdm


import pandas as pd

from data_scraper.core.scraper import Scraper
from data_scraper.processors.solutions_provider import SolutionsProvider


LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)


class SolutionsRecord(TypedDict):
    """Represents a record extracted from Solutions"""
    kb_id: str
    kind: str
    topic: str
    url: str
    issue: str
    diagnosticsteps: str
    text: str
    components: list[str]


class SolutionsScraper(Scraper):
    """Main class for Solutions scraping and processing."""

    def __init__(self, config: dict):
        super().__init__(config=config)
        self.config = config
        self.kb_provider = SolutionsProvider(
            self.config["solutions_url"],
            self.config["solutions_token"]
        )

    def get_documents(self) -> list[dict]:
        documents = self.kb_provider.get_solutions(
            self.config["product_name"],
            self.config["max_results"])
        return documents

    def get_records(self, documents: List[Dict]) -> list[SolutionsRecord]:
        """Convert Solution API responses to SolutionsRecord"""
        solutions_records: list[SolutionsRecord] = []
        for raw_result in tqdm(documents, desc="Processing issues"):
            solutions_records.append(
                {
                    "kb_id": raw_result.get('id', None),
                    "url": raw_result.get('view_uri', ''),
                    "topic": raw_result.get('publishedTitle', ''),
                    "issue": ''.join(raw_result.get('issue', '')),
                    "diagnosticsteps": ''.join(raw_result.get('solution_diagnosticsteps', 'N/A')),
                    "text": ''.join(raw_result.get('solution_resolution', 'N/A')),
                    "components": raw_result.get('component', []),
                    "kind": "solution",
                }
            )

        return solutions_records

    def get_chunks(self, record: dict) -> list[str]:
        chunks = []

        for kb_field in ["topic", "issue"]:
            chunks += self.text_processor.split_text(record[kb_field])

        return chunks

    def record_postprocessing(self, record):
        # Postprocessing is not required for Solutions records
        pass

    def cleanup_records(
        self, records: list, backup_path: str = "solutions_all_data.csv"
    ) -> list:
        df = pd.DataFrame(records)

        LOG.info("Records stats BEFORE cleanup: %d", df.shape[0])

        df = df.dropna()
        df = df.drop_duplicates(subset=["text"])

        LOG.info("Records stats AFTER cleanup: %d", df.shape[0])

        LOG.info("Saving backup to: %s", backup_path)
        df.to_csv(backup_path)

        return [SolutionsRecord(**row) for row in df.to_dict(orient="records")]
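For orientation, a minimal sketch of driving the new scraper directly rather than through the CLI. The config keys mirror the config_args dict built by solutions_scraper() in data_scraper/main.py; every URL, token, and numeric value below is a placeholder, and run() is inherited from the shared Scraper base class.

from data_scraper.core.solutions_scraper import SolutionsScraper

config = {
    "database_client_url": "https://vector-db.example.com:6333",  # placeholder
    "llm_server_url": "https://llm.example.com/v1",               # placeholder
    "llm_api_key": "example-llm-key",                             # placeholder
    "database_api_key": "example-db-key",                         # placeholder
    "chunk_size": 512,                                            # placeholder; the CLI defaults to DEFAULT_CHUNK_SIZE
    "embedding_model": "BAAI/bge-m3",
    "db_collection_name": "rca-solutions",
    "solutions_url": "https://access.redhat.com",
    "solutions_token": "example-kb-token",                        # placeholder
    "product_name": "OpenStack",
    "max_results": 9999,
    "recreate_collection": True,
}

# run() comes from the Scraper base class, which is expected to orchestrate
# the hooks overridden above (get_documents, get_records, cleanup_records,
# get_chunks) to fetch, clean, chunk, and index the Solutions data.
scraper = SolutionsScraper(config)
scraper.run()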

data_scraper/main.py (47 additions & 0 deletions)

@@ -6,6 +6,7 @@
 from data_scraper.common import constants
 from data_scraper.core.scraper import JiraScraper, OSPDocScraper
 from data_scraper.core.errata_scraper import ErrataScraper
+from data_scraper.core.solutions_scraper import SolutionsScraper

 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -178,3 +179,49 @@ def errata_scraper() -> None:

     scraper = ErrataScraper(config_args)
     scraper.run()
+
+def solutions_scraper() -> None:
+    """Entry point for command line execution."""
+    parser = ArgumentParser("solutions_scraper")
+
+    # Required arguments
+    parser.add_argument("--database_client_url", type=str, required=True)
+    parser.add_argument("--llm_server_url", type=str, required=True)
+    parser.add_argument("--llm_api_key", type=str, required=True)
+    parser.add_argument("--database_api_key", type=str, required=True)
+    parser.add_argument("--solutions-token", type=str, required=True)
+
+    # Optional arguments
+    parser.add_argument("--solutions-url", type=str,
+                        default=constants.DEFAULT_SOLUTIONS_PUBLIC_URL)
+    parser.add_argument("--max_results", type=int,
+                        default=constants.SOLUTIONS_MAX_RESULTS)
+    parser.add_argument("--chunk_size", type=int,
+                        default=constants.DEFAULT_CHUNK_SIZE)
+    parser.add_argument("--embedding_model", type=str,
+                        default=constants.DEFAULT_EMBEDDING_MODEL)
+    parser.add_argument("--db_collection_name", type=str,
+                        default=constants.SOLUTIONS_COLLECTION_NAME)
+    parser.add_argument("--product_name", type=str,
+                        default=constants.SOLUTIONS_PRODUCT_NAME)
+    parser.add_argument("--recreate_collection", type=bool, default=True,
+                        help="Recreate database collection from scratch.")
+    args = parser.parse_args()
+
+    config_args = {
+        "database_client_url": args.database_client_url,
+        "llm_server_url": args.llm_server_url,
+        "llm_api_key": args.llm_api_key,
+        "database_api_key": args.database_api_key,
+        "chunk_size": args.chunk_size,
+        "embedding_model": args.embedding_model,
+        "db_collection_name": args.db_collection_name,
+        "solutions_url": args.solutions_url,
+        "solutions_token": args.solutions_token,
+        "product_name": args.product_name,
+        "max_results": args.max_results,
+        "recreate_collection": args.recreate_collection,
+    }
+
+    scraper = SolutionsScraper(config_args)
+    scraper.run()
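Once the console-script entry registered in pyproject.toml below is installed, a typical invocation would look roughly like the following (flag names come from the parser above; every value shown is a placeholder):

solutions_scraper \
    --database_client_url https://vector-db.example.com:6333 \
    --llm_server_url https://llm.example.com/v1 \
    --llm_api_key example-llm-key \
    --database_api_key example-db-key \
    --solutions-token example-kb-token \
    --product_name OpenStack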
data_scraper/processors/solutions_provider.py (new file, 69 additions & 0 deletions)
"""Client to fetch Solutions."""
import logging
import requests

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

class SolutionsProvider:
    """Provider for Solutions"""

    def __init__(self, query_url: str, query_token: str):
        self.query_url = query_url
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {query_token}",
        }

    def get_solutions(self, product_name: str,
                      max_results: int,
                      start_at: int = 0) -> list[dict]:
        """Get solutions from the Knowledge Base.

        Queries the Knowledge Base and returns the list of retrieved
        solution documents.

        Args:
            product_name: Search for Solutions for the specific product name
                (e.g., product_name="*OpenStack*")
            max_results: Maximum number of solutions that should be retrieved
            start_at: Offset of the first result to retrieve (for paging).
        """

        url = f"{self.query_url}/hydra/rest/search/v2/kcs"

        query = f"fq=(documentKind:Solution AND product: *{product_name}* AND solution_resolution:*)&sort=lastModifiedDate desc"

        payload = {
            "clientName": "cli",
            "expression": query,
            "q": "*",
            "rows": max_results,
            "start": start_at
        }

        LOG.info("Processing Solutions request [product: %s, max_results: %d, "
                 "start_at: %d]", product_name, max_results, start_at)

        try:
            response = requests.post(
                url,
                json=payload,
                headers=self.headers,
                verify=False,
                timeout=(3.05, 180),
            )
        except requests.exceptions.Timeout:
            LOG.error("Request to Knowledge base %s timed out.", query)
            return []
        except requests.exceptions.RequestException as e:
            LOG.error("Error fetching KB data: %s", e)
            return []
        parsed_response = response.json()['response']
        LOG.info("Found %d Solution records matching the query and retrieved %d "
                 "of them. [query: %s, max_results: %d, start_at: %d]",
                 parsed_response["numFound"],
                 len(parsed_response["docs"]),
                 query, max_results, start_at)

        return parsed_response["docs"]
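A minimal sketch of exercising the provider on its own, assuming a valid Customer Portal API token. The class, method, default URL, and the field names printed below all come from this diff; the token value and result limit are placeholder assumptions.

from data_scraper.processors.solutions_provider import SolutionsProvider

provider = SolutionsProvider(
    query_url="https://access.redhat.com",  # DEFAULT_SOLUTIONS_PUBLIC_URL
    query_token="example-api-token",        # placeholder token
)

# Returns the raw KCS search hits; SolutionsScraper.get_records() reads the
# 'id', 'view_uri', 'publishedTitle', 'issue', 'solution_diagnosticsteps',
# 'solution_resolution', and 'component' fields from each hit.
docs = provider.get_solutions(product_name="OpenStack", max_results=10)
for doc in docs:
    print(doc.get("id"), doc.get("publishedTitle"), doc.get("view_uri"))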

pyproject.toml (1 addition & 0 deletions)

@@ -38,6 +38,7 @@ errata_scraper = "data_scraper.main:errata_scraper"
 feedback_exporter = "feedback_exporter.export_feedback:main"
 evaluation = "evaluation.evaluation:main"
 osp_doc_scraper = "data_scraper.main:osp_doc_scraper"
+solutions_scraper = "data_scraper.main:solutions_scraper"

 [tool.setuptools.packages.find]
 include = ["data_scraper*", "feedback_exporter*"]
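With this entry point added, reinstalling the package (for example, pip install -e .) should make the solutions_scraper command from data_scraper/main.py available on the command line, alongside the existing errata_scraper and osp_doc_scraper scripts.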
