"""
Database Engine for Google Maps Scraper
=======================================
SQLite with WAL mode for high-performance concurrent read/write operations.
Handles deduplication, job tracking, and telemetry logging.
"""
import sqlite3
import hashlib
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple
class DatabaseManager:
"""Manages all database operations with optimized settings for bulk scraping."""
def __init__(self, db_path: str = "maps_data.db"):
self.db_path = db_path
self.conn = sqlite3.connect(db_path)
self.conn.row_factory = sqlite3.Row # Enable dict-like access
self._optimize_connection()
self._create_base_tables()
def _optimize_connection(self):
"""Enable WAL mode and performance optimizations."""
# WAL mode: 10x faster writes, allows concurrent reads during writes
self.conn.execute("PRAGMA journal_mode=WAL;")
# Faster writes, slightly less safe on power loss
self.conn.execute("PRAGMA synchronous=NORMAL;")
# Use 64MB of RAM for cache
self.conn.execute("PRAGMA cache_size=-64000;")
# Store temp tables in memory
self.conn.execute("PRAGMA temp_store=MEMORY;")
self.conn.commit()
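    # A quick sanity check (a sketch; use a scratch file rather than ':memory:',
    # since in-memory databases cannot use WAL and report journal_mode='memory'):
    #
    #     db = DatabaseManager("scratch.db")
    #     assert db.conn.execute("PRAGMA journal_mode;").fetchone()[0] == "wal"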
def _create_base_tables(self):
"""Create the core schema tables."""
# Cities table - pre-populated with US cities
self.conn.execute("""
CREATE TABLE IF NOT EXISTS cities (
rank INTEGER PRIMARY KEY,
city TEXT NOT NULL,
state TEXT NOT NULL,
population INTEGER,
full_name TEXT UNIQUE NOT NULL
);
""")
# Jobs table - track scrape progress
self.conn.execute("""
CREATE TABLE IF NOT EXISTS jobs (
job_id INTEGER PRIMARY KEY AUTOINCREMENT,
table_name TEXT NOT NULL,
query TEXT NOT NULL,
city TEXT NOT NULL,
preset TEXT,
status TEXT DEFAULT 'PENDING',
retry_count INTEGER DEFAULT 0,
results_found INTEGER DEFAULT 0,
bandwidth_used_kb REAL DEFAULT 0,
execution_time_sec REAL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
error_message TEXT
);
""")
# Telemetry table - debug and performance logging
self.conn.execute("""
CREATE TABLE IF NOT EXISTS telemetry (
log_id INTEGER PRIMARY KEY AUTOINCREMENT,
job_id INTEGER,
event_type TEXT NOT NULL,
proxy_endpoint TEXT,
response_time_ms INTEGER,
status_code INTEGER,
error_details TEXT,
metadata TEXT,
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(job_id) REFERENCES jobs(job_id)
);
""")
# Zip codes table - US zip codes by state
self.conn.execute("""
CREATE TABLE IF NOT EXISTS zip_codes (
zipcode TEXT PRIMARY KEY,
state_abbr TEXT NOT NULL,
state_name TEXT NOT NULL,
city TEXT,
county TEXT
);
""")
# Migrate first so new columns exist before indexes are created
self._migrate_zip_codes_schema()
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_zip_state ON zip_codes(state_abbr);")
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_zip_population ON zip_codes(population DESC);")
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_zip_city ON zip_codes(city, state_abbr);")
# Create indexes for common queries
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);")
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_jobs_table ON jobs(table_name);")
self.conn.execute("CREATE INDEX IF NOT EXISTS idx_cities_rank ON cities(rank);")
self.conn.commit()
def create_results_table(self, query: str, preset: str) -> str:
"""
Create a new results table for this specific scrape run.
Returns the dynamically generated table name.
Naming convention: results_{sanitized_query}_{preset}_{timestamp}
"""
        # Sanitize the query for use in a table name (keep alphanumerics and
        # underscores; spaces and punctuation are dropped)
        safe_query = "".join(c for c in query if c.isalnum() or c == "_")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
table_name = f"results_{safe_query}_{preset}_{timestamp}"
self.conn.execute(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
internal_id INTEGER PRIMARY KEY AUTOINCREMENT,
dedup_hash TEXT UNIQUE NOT NULL,
-- Google Identifiers
place_id TEXT,
data_id TEXT,
cid TEXT,
google_feature_id TEXT,
-- Business Info
name TEXT NOT NULL,
category TEXT,
address_text TEXT,
zip_code TEXT,
phone_number TEXT,
website_url TEXT,
-- Metrics
rating REAL,
review_count INTEGER,
rating_text_raw TEXT,
-- Status & Attributes
open_status TEXT,
hours_snippet TEXT,
service_options TEXT,
price_level TEXT,
description_snippet TEXT,
accessibility TEXT,
-- Geospatial
latitude REAL,
longitude REAL,
plus_code TEXT,
-- Media
thumbnail_url TEXT,
-- Context
search_query TEXT,
search_city TEXT,
city_name TEXT,
state TEXT,
rank_on_page INTEGER,
google_url TEXT,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
# Create indexes for this results table
self.conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{table_name}_city ON {table_name}(search_city);")
self.conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{table_name}_name ON {table_name}(name);")
self.conn.commit()
return table_name
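    # Naming sketch (the timestamp comes from datetime.now(), so the suffix varies):
    #
    #     db = DatabaseManager("scratch.db")
    #     table = db.create_results_table("pet cremation", "top_100")
    #     # -> e.g. 'results_petcremation_top_100_20251215_101910'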
def _migrate_zip_codes_schema(self):
"""Add new columns to zip_codes table for existing databases."""
cols = {r[1] for r in self.conn.execute("PRAGMA table_info(zip_codes)").fetchall()}
for col, typedef in [('population', 'INTEGER DEFAULT 0'), ('lat', 'REAL'), ('lng', 'REAL')]:
if col not in cols:
self.conn.execute(f"ALTER TABLE zip_codes ADD COLUMN {col} {typedef}")
self.conn.commit()
def insert_city_bulk(self, city_data: List[Tuple]) -> int:
"""
Bulk insert cities during setup.
Args:
city_data: List of tuples (rank, city, state, population, full_name)
Returns:
Number of rows inserted
"""
cursor = self.conn.executemany("""
INSERT OR IGNORE INTO cities (rank, city, state, population, full_name)
VALUES (?, ?, ?, ?, ?)
""", city_data)
self.conn.commit()
return cursor.rowcount
def insert_result(self, table_name: str, data: Dict[str, Any]) -> bool:
"""
Insert a single result with MD5 deduplication.
Deduplication key: MD5(name + phone_number)
Returns True if new record, False if duplicate.
"""
# Create deduplication hash
unique_str = f"{data.get('name', '')}{data.get('phone_number', '')}"
dedup_hash = hashlib.md5(unique_str.encode()).hexdigest()
try:
            cursor = self.conn.execute(f"""
INSERT OR IGNORE INTO {table_name} (
dedup_hash, place_id, data_id, cid, google_feature_id,
name, category, address_text, zip_code, phone_number, website_url,
rating, review_count, rating_text_raw,
open_status, hours_snippet, service_options, price_level,
description_snippet, accessibility,
latitude, longitude, plus_code,
thumbnail_url, search_query, search_city, city_name, state, rank_on_page, google_url
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
dedup_hash,
data.get('place_id'),
data.get('data_id'),
data.get('cid'),
data.get('google_feature_id'),
data.get('name'),
data.get('category'),
data.get('address_text'),
data.get('zip_code'),
data.get('phone_number'),
data.get('website_url'),
data.get('rating'),
data.get('review_count'),
data.get('rating_text_raw'),
data.get('open_status'),
data.get('hours_snippet'),
data.get('service_options'),
data.get('price_level'),
data.get('description_snippet'),
data.get('accessibility'),
data.get('latitude'),
data.get('longitude'),
data.get('plus_code'),
data.get('thumbnail_url'),
data.get('search_query'),
data.get('search_city'),
data.get('city_name'),
data.get('state'),
data.get('rank_on_page'),
data.get('google_url')
))
            self.conn.commit()
            # rowcount is 1 if the row was inserted, 0 if INSERT OR IGNORE skipped a
            # duplicate; conn.total_changes would be unreliable here because it is
            # cumulative over the connection's lifetime
            return cursor.rowcount == 1
except sqlite3.IntegrityError:
return False # Duplicate
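    # Dedup sketch (hypothetical values; the second call hits the same MD5 hash):
    #
    #     db.insert_result(table, {"name": "Acme Vet", "phone_number": "555-0100"})  # True
    #     db.insert_result(table, {"name": "Acme Vet", "phone_number": "555-0100"})  # False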
def insert_results_batch(self, table_name: str, data_list: List[Dict[str, Any]]) -> int:
"""
Batch insert multiple results.
Returns count of new (non-duplicate) records inserted.
"""
inserted = 0
for data in data_list:
if self.insert_result(table_name, data):
inserted += 1
return inserted
def get_cities_by_preset(self, preset: str) -> List[str]:
"""
Return list of 'City, State' strings based on preset.
Available presets: top_10, top_100, top_1000, top_2500
"""
limits = {
"top_10": 10,
"top_100": 100,
"top_1000": 1000,
"top_2500": 2500
}
limit = limits.get(preset)
if limit is None:
return []
cursor = self.conn.execute(
"SELECT full_name FROM cities ORDER BY rank ASC LIMIT ?",
(limit,)
)
return [row[0] for row in cursor.fetchall()]
def get_city_count(self) -> int:
"""Return total number of cities in database."""
cursor = self.conn.execute("SELECT COUNT(*) FROM cities")
return cursor.fetchone()[0]
# --- Zip Code Methods ---
def insert_zip_codes_bulk(self, zip_data: list) -> int:
"""Bulk insert zip codes.
Each item: (zipcode, state_abbr, state_name, city, county, population, lat, lng)
Backward-compatible: 5-element tuples use population=0, lat=None, lng=None.
"""
normalized = []
for item in zip_data:
if len(item) == 5:
normalized.append((*item, 0, None, None))
else:
normalized.append(item)
cursor = self.conn.executemany("""
INSERT OR IGNORE INTO zip_codes (zipcode, state_abbr, state_name, city, county, population, lat, lng)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", normalized)
self.conn.commit()
return cursor.rowcount
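    # Both tuple shapes are accepted (a sketch with made-up values):
    #
    #     db.insert_zip_codes_bulk([
    #         ("10001", "NY", "New York", "New York", "New York"),    # legacy 5-tuple
    #         ("73301", "TX", "Texas", "Austin", "Travis",
    #          33000, 30.27, -97.74),                                 # full 8-tuple
    #     ])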
def get_zip_count(self) -> int:
"""Return total number of zip codes in database."""
cursor = self.conn.execute("SELECT COUNT(*) FROM zip_codes")
return cursor.fetchone()[0]
def update_zip_populations(self, pop_map: dict) -> int:
"""
Bulk-update population for zip codes.
pop_map: {zipcode: population_int}
Returns number of rows updated.
"""
        updated = 0
        for zipcode, pop in pop_map.items():
            cursor = self.conn.execute(
                "UPDATE zip_codes SET population = ? WHERE zipcode = ?",
                (pop, zipcode)
            )
            updated += cursor.rowcount  # Count only zip codes that actually exist
self.conn.commit()
return updated
def get_zips_by_preset(self, preset: str) -> list:
"""
Return zip codes for a preset, sorted by population DESC.
If no population data is loaded, falls back to city-population ranking
via a join with the cities table.
"""
limits = {
'top_10': 10, 'top_100': 100, 'top_500': 500,
'top_1000': 1000, 'top_5000': 5000, 'top_10000': 10000, 'all': 99999
}
limit = limits.get(preset)
if limit is None:
return []
# Check whether population data has been loaded
max_pop = self.conn.execute("SELECT MAX(population) FROM zip_codes").fetchone()[0]
if max_pop and max_pop > 0:
cursor = self.conn.execute(
"SELECT zipcode FROM zip_codes ORDER BY population DESC LIMIT ?", (limit,)
)
else:
# Fallback: join zip_codes → cities on city+state, rank by city population
cursor = self.conn.execute("""
SELECT z.zipcode
FROM zip_codes z
LEFT JOIN cities c
ON lower(z.city) = lower(c.city)
AND z.state_abbr = substr(c.full_name, instr(c.full_name, ', ') + 2)
ORDER BY COALESCE(c.population, 0) DESC, z.zipcode ASC
LIMIT ?
""", (limit,))
return [row[0] for row in cursor.fetchall()]
def get_zips_by_city(self, city: str, state_abbr: str) -> list:
"""Return all zip codes for a city/state pair, sorted by population DESC."""
cursor = self.conn.execute(
"SELECT zipcode FROM zip_codes WHERE city = ? AND state_abbr = ? ORDER BY population DESC",
(city.title(), state_abbr.upper())
)
return [row[0] for row in cursor.fetchall()]
def get_zips_by_state(self, state_abbr: str) -> List[str]:
"""Return all zip codes for a state abbreviation (e.g. 'CA'), sorted by population DESC."""
cursor = self.conn.execute(
"SELECT zipcode FROM zip_codes WHERE state_abbr = ? ORDER BY population DESC",
(state_abbr.upper(),)
)
return [row[0] for row in cursor.fetchall()]
def get_zip_info(self, zipcode: str) -> dict:
"""Return city/state info for a zip code."""
row = self.conn.execute(
"SELECT zipcode, city, state_abbr, state_name, population FROM zip_codes WHERE zipcode = ?",
(zipcode,)
).fetchone()
if row:
return {'zipcode': row[0], 'city': row[1], 'state_abbr': row[2], 'state_name': row[3], 'population': row[4]}
return {}
def get_all_zips(self) -> List[str]:
"""Return all zip codes nationwide."""
cursor = self.conn.execute("SELECT zipcode FROM zip_codes ORDER BY state_abbr, zipcode")
return [row[0] for row in cursor.fetchall()]
def get_zip_states(self) -> List[Dict[str, Any]]:
"""Return list of states with their zip code counts."""
cursor = self.conn.execute("""
SELECT state_abbr, state_name, COUNT(*) as zip_count
FROM zip_codes GROUP BY state_abbr ORDER BY state_abbr
""")
return [{"abbr": r[0], "name": r[1], "count": r[2]} for r in cursor.fetchall()]
# --- Job Management ---
def create_job(self, table_name: str, query: str, city: str, preset: str = None) -> int:
"""Create a new job entry and return the job_id."""
cursor = self.conn.execute("""
INSERT INTO jobs (table_name, query, city, preset, status)
VALUES (?, ?, ?, ?, 'PENDING')
""", (table_name, query, city, preset))
self.conn.commit()
return cursor.lastrowid
def update_job_status(
self,
job_id: int,
status: str,
results_found: int = None,
execution_time_sec: float = None,
error_message: str = None,
bandwidth_used_kb: float = None
):
"""Update job status and metrics."""
updates = ["status = ?", "last_updated = CURRENT_TIMESTAMP"]
params = [status]
if results_found is not None:
updates.append("results_found = ?")
params.append(results_found)
if execution_time_sec is not None:
updates.append("execution_time_sec = ?")
params.append(execution_time_sec)
if error_message is not None:
updates.append("error_message = ?")
params.append(error_message)
if bandwidth_used_kb is not None:
updates.append("bandwidth_used_kb = ?")
params.append(bandwidth_used_kb)
params.append(job_id)
self.conn.execute(
f"UPDATE jobs SET {', '.join(updates)} WHERE job_id = ?",
params
)
self.conn.commit()
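    # Typical job lifecycle sketch (values are illustrative):
    #
    #     job_id = db.create_job(table, "plumber", "Austin, Texas", preset="top_100")
    #     db.update_job_status(job_id, "RUNNING")
    #     db.update_job_status(job_id, "COMPLETED", results_found=42,
    #                          execution_time_sec=3.7, bandwidth_used_kb=512.0)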
def increment_job_retry(self, job_id: int):
"""Increment retry count for a job."""
self.conn.execute(
"UPDATE jobs SET retry_count = retry_count + 1 WHERE job_id = ?",
(job_id,)
)
self.conn.commit()
def get_job_stats(self) -> Dict[str, Any]:
"""Get aggregate statistics from all jobs."""
cursor = self.conn.execute("""
SELECT
COUNT(*) as total_jobs,
SUM(CASE WHEN status = 'COMPLETED' THEN 1 ELSE 0 END) as completed,
SUM(CASE WHEN status = 'FAILED' THEN 1 ELSE 0 END) as failed,
SUM(results_found) as total_results,
SUM(bandwidth_used_kb) as total_bandwidth_kb,
AVG(execution_time_sec) as avg_time_sec,
COUNT(DISTINCT city) as unique_cities,
COUNT(DISTINCT query) as unique_queries
FROM jobs
""")
row = cursor.fetchone()
return {
'total_jobs': row[0] or 0,
'completed': row[1] or 0,
'failed': row[2] or 0,
'total_results': row[3] or 0,
'total_bandwidth_kb': row[4] or 0,
'avg_time_sec': row[5] or 0,
'unique_cities': row[6] or 0,
'unique_queries': row[7] or 0
}
def get_recent_jobs(self, limit: int = 50) -> List[Dict[str, Any]]:
"""Get recent jobs with all details."""
        cursor = self.conn.execute("""
            SELECT job_id, table_name, query, city, preset, status,
                   retry_count, results_found, bandwidth_used_kb,
                   execution_time_sec, created_at, error_message
            FROM jobs
            ORDER BY created_at DESC
            LIMIT ?
        """, (limit,))
columns = ['job_id', 'table_name', 'query', 'city', 'preset', 'status',
'retry_count', 'results_found', 'bandwidth_used_kb',
'execution_time_sec', 'created_at', 'error_message']
return [dict(zip(columns, row)) for row in cursor.fetchall()]
def get_failed_jobs(self) -> List[Dict[str, Any]]:
"""Get all failed jobs for retry."""
cursor = self.conn.execute("""
SELECT job_id, table_name, query, city, preset, retry_count, error_message
FROM jobs
WHERE status = 'FAILED'
ORDER BY created_at DESC
""")
columns = ['job_id', 'table_name', 'query', 'city', 'preset', 'retry_count', 'error_message']
return [dict(zip(columns, row)) for row in cursor.fetchall()]
def clear_jobs(self):
"""Clear all job history."""
self.conn.execute("DELETE FROM jobs")
self.conn.commit()
# --- Telemetry ---
def log_telemetry(
self,
event_type: str,
job_id: int = None,
response_time_ms: int = None,
status_code: int = None,
error_details: str = None,
metadata: str = None
):
"""Log a telemetry event for debugging."""
self.conn.execute("""
INSERT INTO telemetry (job_id, event_type, response_time_ms, status_code, error_details, metadata)
VALUES (?, ?, ?, ?, ?, ?)
""", (job_id, event_type, response_time_ms, status_code, error_details, metadata))
self.conn.commit()
# --- Utility Methods ---
def list_result_tables(self) -> List[str]:
"""List all result tables in the database."""
cursor = self.conn.execute("""
SELECT name FROM sqlite_master
WHERE type='table' AND name LIKE 'results_%'
ORDER BY name DESC
""")
return [row[0] for row in cursor.fetchall()]
def get_table_row_count(self, table_name: str) -> int:
"""Get row count for a specific table."""
cursor = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}")
return cursor.fetchone()[0]
def get_table_hashes(self, table_name: str) -> set:
"""Get all dedup hashes from a table for session-level deduplication."""
try:
cursor = self.conn.execute(f"SELECT dedup_hash FROM {table_name}")
return {row[0] for row in cursor.fetchall()}
except Exception:
return set()
def export_table_to_dataframe(self, table_name: str):
"""Export a table to pandas DataFrame."""
import pandas as pd
return pd.read_sql(f"SELECT * FROM {table_name}", self.conn)
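    # pandas is imported lazily above, so it is only required when exporting:
    #
    #     df = db.export_table_to_dataframe(table)
    #     df.to_csv("results.csv", index=False)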
# --- Search Methods ---
def search(
self,
query: str,
tables: List[str] = None,
field: str = None,
limit: int = 100
) -> List[Dict[str, Any]]:
"""
Search for businesses across result tables.
Args:
query: Search term (supports SQL LIKE patterns with %)
tables: List of table names to search (None = all result tables)
field: Specific field to search (None = search name, phone, address, city)
limit: Maximum results to return
Returns:
List of matching records with source table info
"""
if tables is None:
tables = self.list_result_tables()
if not tables:
return []
# Add wildcards if not present
if '%' not in query:
query = f'%{query}%'
results = []
fields_to_search = [field] if field else ['name', 'phone_number', 'address_text', 'search_city']
for table_name in tables:
try:
# Build WHERE clause for multiple fields
conditions = " OR ".join([f"{f} LIKE ?" for f in fields_to_search])
params = [query] * len(fields_to_search)
cursor = self.conn.execute(f"""
SELECT *, '{table_name}' as source_table
FROM {table_name}
WHERE {conditions}
LIMIT ?
""", params + [limit])
                for row in cursor:
                    results.append(dict(row))
                    if len(results) >= limit:
                        break
                if len(results) >= limit:
                    break  # Limit reached; stop scanning the remaining tables
            except Exception:
                continue  # Skip tables with a different schema
return results[:limit]
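    # Search sketch ('%' wildcards are added automatically when absent):
    #
    #     hits = db.search("plumb")                        # name, phone, address, city
    #     hits = db.search("Austin", field="search_city")  # a single field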
def search_by_name(self, name: str, tables: List[str] = None, limit: int = 100) -> List[Dict]:
"""Search by business name."""
return self.search(name, tables=tables, field='name', limit=limit)
def search_by_phone(self, phone: str, tables: List[str] = None, limit: int = 100) -> List[Dict]:
"""Search by phone number."""
# Normalize phone for search (remove common formatting)
phone_normalized = phone.replace('-', '%').replace(' ', '%').replace('(', '%').replace(')', '%')
return self.search(phone_normalized, tables=tables, field='phone_number', limit=limit)
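    # Normalization sketch: '(512) 555-0100' becomes '%512%%555%0100', a LIKE
    # pattern that matches the stored number regardless of its formatting.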
def search_by_city(self, city: str, tables: List[str] = None, limit: int = 100) -> List[Dict]:
"""Search by city."""
return self.search(city, tables=tables, field='search_city', limit=limit)
def get_table_stats(self, table_name: str) -> Dict[str, Any]:
"""Get statistics for a result table."""
try:
cursor = self.conn.execute(f"""
SELECT
COUNT(*) as total_records,
COUNT(DISTINCT search_city) as unique_cities,
COUNT(phone_number) as with_phone,
COUNT(website_url) as with_website,
AVG(rating) as avg_rating,
MIN(scraped_at) as first_scraped,
MAX(scraped_at) as last_scraped
FROM {table_name}
""")
row = cursor.fetchone()
return dict(row) if row else {}
except Exception:
return {}
def get_all_records(self, tables: List[str] = None, limit: int = 1000) -> List[Dict]:
"""Get all records from specified tables."""
if tables is None:
tables = self.list_result_tables()
results = []
for table_name in tables:
try:
cursor = self.conn.execute(f"""
SELECT *, '{table_name}' as source_table
FROM {table_name}
LIMIT ?
""", (limit - len(results),))
results.extend([dict(row) for row in cursor])
if len(results) >= limit:
break
except Exception:
continue
return results[:limit]
# --- Cleanup & Aggregation Methods ---
def drop_empty_tables(self) -> List[str]:
"""
Drop result tables that have 0 rows.
Returns list of dropped table names.
"""
dropped = []
tables = self.list_result_tables()
for table_name in tables:
count = self.get_table_row_count(table_name)
if count == 0:
self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
dropped.append(table_name)
self.conn.commit()
return dropped
def drop_table(self, table_name: str) -> bool:
"""Drop a specific table. Returns True if dropped."""
try:
self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
self.conn.commit()
return True
except Exception:
return False
    def extract_query_from_table_name(self, table_name: str) -> Optional[str]:
        """
        Extract the query name from a table name.
        e.g., 'results_PetCremation_top_100_20251215_101910' -> 'PetCremation'
        Returns None if the name is not a results table.
        """
if not table_name.startswith('results_'):
return None
parts = table_name[8:].split('_') # Remove 'results_' prefix
if len(parts) >= 2:
# Query is everything before the preset/timestamp
# Find where preset starts (top_X, custom, or timestamp)
query_parts = []
for part in parts:
if part in ['top', 'custom'] or part.isdigit():
break
query_parts.append(part)
return ''.join(query_parts) if query_parts else parts[0]
return parts[0] if parts else None
    def extract_preset_from_table_name(self, table_name: str) -> str:
        """
        Extract the preset from a table name.
        e.g., 'results_PetCremation_top_100_20251215' -> 'top_100'
        """
if 'top_10_' in table_name or '_top10_' in table_name:
return 'top_10'
elif 'top_100_' in table_name or '_top100_' in table_name:
return 'top_100'
elif 'top_1000_' in table_name or '_top1000_' in table_name:
return 'top_1000'
elif 'top_2500_' in table_name or '_top2500_' in table_name:
return 'top_2500'
elif '_custom_' in table_name:
return 'custom'
return 'unknown'
def group_tables_by_query(self) -> Dict[str, List[str]]:
"""
Group result tables by their query name.
Returns: {'PetCremation': ['results_PetCremation_top10_...', ...], ...}
"""
tables = self.list_result_tables()
groups = {}
for table_name in tables:
# Skip master and archive tables
if table_name.startswith('master_') or table_name.startswith('archive_'):
continue
query = self.extract_query_from_table_name(table_name)
if query:
if query not in groups:
groups[query] = []
groups[query].append(table_name)
return groups
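    # Grouping sketch (illustrative table names):
    #
    #     db.group_tables_by_query()
    #     # -> {'PetCremation': ['results_PetCremation_top_10_20251201_090000',
    #     #                      'results_PetCremation_top_100_20251215_101910']}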
def create_master_table(self, query_name: str) -> str:
"""
Create a master table for aggregated results.
Returns the master table name.
Master Table Schema includes all original fields plus:
- source_tables: JSON array of source table names
- source_presets: JSON array of presets used
- first_seen_at: Earliest scraped_at timestamp
- last_seen_at: Latest scraped_at timestamp
- seen_count: Number of times found across tables
- cities_found_in: JSON array of cities where found
- aggregated_at: When added to master table
"""
master_name = f"master_{query_name}"
self.conn.execute(f"""
CREATE TABLE IF NOT EXISTS {master_name} (
internal_id INTEGER PRIMARY KEY AUTOINCREMENT,
dedup_hash TEXT UNIQUE NOT NULL,
-- Google Identifiers
place_id TEXT,
data_id TEXT,
cid TEXT,
google_feature_id TEXT,
-- Business Info
name TEXT NOT NULL,
category TEXT,
address_text TEXT,
zip_code TEXT,
phone_number TEXT,
website_url TEXT,
-- Metrics
rating REAL,
review_count INTEGER,
rating_text_raw TEXT,
-- Status & Attributes
open_status TEXT,
hours_snippet TEXT,
service_options TEXT,
price_level TEXT,
-- Geospatial
latitude REAL,
longitude REAL,
plus_code TEXT,
-- Media
thumbnail_url TEXT,
-- Original Context
search_query TEXT,
google_url TEXT,
-- Aggregation Metadata
source_tables TEXT, -- JSON array of source table names
source_presets TEXT, -- JSON array of presets used
cities_found_in TEXT, -- JSON array of cities where found
first_seen_at TIMESTAMP, -- Earliest scraped_at
last_seen_at TIMESTAMP, -- Latest scraped_at
seen_count INTEGER DEFAULT 1,
aggregated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
# Create indexes
self.conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{master_name}_name ON {master_name}(name);")
self.conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{master_name}_phone ON {master_name}(phone_number);")
self.conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{master_name}_seen ON {master_name}(seen_count DESC);")
self.conn.commit()
return master_name
def aggregate_tables_to_master(self, query_name: str, source_tables: List[str]) -> Dict[str, Any]:
"""
Aggregate multiple result tables into a master table.
Returns stats: {
'master_table': str,
'tables_processed': int,
'records_aggregated': int,
'new_records': int,
'updated_records': int
}
"""
import json
master_name = self.create_master_table(query_name)
stats = {
'master_table': master_name,
'tables_processed': 0,
'records_aggregated': 0,
'new_records': 0,
'updated_records': 0
}
for table_name in source_tables:
preset = self.extract_preset_from_table_name(table_name)
try:
cursor = self.conn.execute(f"SELECT * FROM {table_name}")
columns = [desc[0] for desc in cursor.description]
for row in cursor:
record = dict(zip(columns, row))
dedup_hash = record.get('dedup_hash')
if not dedup_hash:
continue
stats['records_aggregated'] += 1
# Check if already in master
existing = self.conn.execute(
f"SELECT source_tables, source_presets, cities_found_in, first_seen_at, last_seen_at, seen_count FROM {master_name} WHERE dedup_hash = ?",
(dedup_hash,)
).fetchone()
if existing:
# Update existing record
old_tables = json.loads(existing[0] or '[]')
old_presets = json.loads(existing[1] or '[]')
old_cities = json.loads(existing[2] or '[]')
old_first = existing[3]
old_last = existing[4]
old_count = existing[5] or 1
# Add new source info
if table_name not in old_tables:
old_tables.append(table_name)
if preset not in old_presets:
old_presets.append(preset)
city = record.get('search_city')
if city and city not in old_cities:
old_cities.append(city)
scraped = record.get('scraped_at')
new_first = min(old_first, scraped) if old_first and scraped else (old_first or scraped)
new_last = max(old_last, scraped) if old_last and scraped else (old_last or scraped)
self.conn.execute(f"""
UPDATE {master_name}
SET source_tables = ?, source_presets = ?, cities_found_in = ?,
first_seen_at = ?, last_seen_at = ?, seen_count = ?,
aggregated_at = CURRENT_TIMESTAMP
WHERE dedup_hash = ?
""", (
json.dumps(old_tables),
json.dumps(old_presets),
json.dumps(old_cities),
new_first,
new_last,
old_count + 1,
dedup_hash
))
stats['updated_records'] += 1
else:
# Insert new record
city = record.get('search_city')
cities_list = [city] if city else []
self.conn.execute(f"""
INSERT INTO {master_name} (
dedup_hash, place_id, data_id, cid,
name, category, address_text, phone_number, website_url,
rating, review_count, rating_text_raw,
open_status, hours_snippet, service_options, price_level,
latitude, longitude, plus_code, thumbnail_url,
search_query, google_url,
source_tables, source_presets, cities_found_in,
first_seen_at, last_seen_at, seen_count
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1)
""", (
dedup_hash,
record.get('place_id'),
record.get('data_id'),
record.get('cid'),
record.get('name'),
record.get('category'),
record.get('address_text'),
record.get('phone_number'),
record.get('website_url'),
record.get('rating'),
record.get('review_count'),
record.get('rating_text_raw'),
record.get('open_status'),
record.get('hours_snippet'),
record.get('service_options'),
record.get('price_level'),
record.get('latitude'),
record.get('longitude'),
record.get('plus_code'),
record.get('thumbnail_url'),
record.get('search_query'),
record.get('google_url'),
json.dumps([table_name]),
json.dumps([preset]),
json.dumps(cities_list),
record.get('scraped_at'),
record.get('scraped_at')
))
stats['new_records'] += 1
stats['tables_processed'] += 1
self.conn.commit()
except Exception as e:
logger = logging.getLogger(__name__)
logger.error(f"Error aggregating {table_name}: {e}")
continue
return stats
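    # End-to-end aggregation sketch:
    #
    #     for query_name, tables in db.group_tables_by_query().items():
    #         stats = db.aggregate_tables_to_master(query_name, tables)
    #         print(stats['master_table'], stats['new_records'], stats['updated_records'])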
def archive_table(self, table_name: str) -> str:
"""
Archive a table by renaming it with 'archive_' prefix.
Returns the new archive table name.
"""
if table_name.startswith('archive_') or table_name.startswith('master_'):
return table_name # Already archived or is master