-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_html_simple.py
More file actions
95 lines (75 loc) · 3.15 KB
/
extract_html_simple.py
File metadata and controls
95 lines (75 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""
Simple script to extract HTML from the website and parse all data
"""
import requests
from bs4 import BeautifulSoup
import json
def extract_all_data_from_html():
    """Fetch the land-records service page and extract the district list.

    Downloads the page at the hard-coded URL, saves the raw HTML to
    ``website-html.html``, collects the ``<option>`` entries of the
    ``ddl_district`` ``<select>`` element, and writes them to
    ``html-extracted-data.json``. Inline ``<script>`` blocks that mention
    "district" or "taluk" are dumped to ``scripts-content.txt`` so they can
    be inspected by hand.

    Returns:
        dict: ``{'districts': [{'value': ..., 'label': ...}, ...],
        'note': ...}`` -- the same structure that is written to the JSON
        file.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: if one of the output files cannot be written.
    """
    print("Fetching website HTML...")

    url = "https://landrecords.karnataka.gov.in/service3/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Without a timeout an unresponsive server would block the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    print("Parsing HTML...")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Save raw HTML
    with open('website-html.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print("✅ Saved raw HTML to website-html.html")

    # Extract districts, skipping placeholder options ("0" / "All" / empty).
    district_select = soup.find('select', {'name': 'ddl_district'})
    districts = []
    if district_select is not None:
        for option in district_select.find_all('option'):
            value = option.get('value', '')
            label = option.text.strip()
            if value and value not in ('0', 'All'):
                districts.append({
                    'value': value,
                    'label': label,
                })

    print(f"Found {len(districts)} districts in HTML")

    # The HTML might have data embedded in inline scripts. Collect every
    # script that mentions the keywords and write them out together; opening
    # the file in 'w' mode once per match would keep only the last script.
    matching_scripts = []
    for script in soup.find_all('script'):
        content = script.string
        if not content:
            continue
        lowered = content.lower()
        if 'district' in lowered or 'taluk' in lowered:
            matching_scripts.append(content)

    if matching_scripts:
        print("Found potential data in script tags")
        # Save script content for inspection
        with open('scripts-content.txt', 'w', encoding='utf-8') as f:
            f.write("\n\n/* ---- next script ---- */\n\n".join(matching_scripts))

    # Also check for hidden inputs or data attributes
    hidden_inputs = soup.find_all('input', type='hidden')
    print(f"Found {len(hidden_inputs)} hidden inputs")

    # Look for any tables with data
    tables = soup.find_all('table')
    print(f"Found {len(tables)} tables in HTML")

    # Save districts data
    output = {
        'districts': districts,
        'note': 'This is initial HTML data. Full data may require form submissions to get taluks/hoblis/villages'
    }

    with open('html-extracted-data.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Extracted {len(districts)} districts")
    print("✅ Saved HTML to website-html.html")
    print("✅ Saved extracted data to html-extracted-data.json")
    print("\nNote: Full data (taluks, hoblis, villages) requires form submissions.")
    print("The HTML contains the initial page structure only.")

    return output
# Script entry point: run the extraction and report any failure.
if __name__ == "__main__":
    try:
        extract_all_data_from_html()
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        # Exit with a non-zero status so shells/CI can detect the failure;
        # previously the script reported the error but still exited with 0.
        raise SystemExit(1)