scraper/server.py at main · Minchen202/scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#!/usr/bin/env python3
# server.py

import os
import re
import json
import asyncio
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional
import aiohttp
from aiohttp import web
import aiofiles
import subprocess
from dotenv import load_dotenv, set_key

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebArchiveServer:
    def __init__(self, port=8080, scraped_data_dir='./scraped_data', archives_dir='./archives'):
        self.port = port
        self.scraped_data_dir = Path(scraped_data_dir)
        self.archives_dir = Path(archives_dir)
        self.app = web.Application()
        self.active_scrape = None
        self.setup_routes()

    def setup_routes(self):
        """Setup server routes"""
        # API routes
        self.app.router.add_get('/api/runs', self.get_runs)
        self.app.router.add_get('/api/run/{run_id}', self.get_run_details)
        self.app.router.add_get('/api/run/{run_id}/pages', self.get_run_pages)
        self.app.router.add_get('/api/run/{run_id}/page/{page_hash}', self.get_page_content)
        self.app.router.add_get('/api/run/{run_id}/preview/{page_hash}', self.preview_page)
        self.app.router.add_get('/api/run/{run_id}/stats', self.get_run_stats)
        self.app.router.add_get('/api/archives', self.get_archives)

        # Global search
        self.app.router.add_get('/api/search', self.global_search)

        # Scraping controls
        self.app.router.add_post('/api/scrape/start', self.start_scrape)
        self.app.router.add_get('/api/scrape/status', self.get_scrape_status)
        self.app.router.add_post('/api/scrape/stop', self.stop_scrape)
        self.app.router.add_get('/api/scrape/config', self.get_scrape_config)

        # Add a route for the root URL to serve index.html
        self.app.router.add_get('/', self.serve_index)

        # Static files
        self.app.router.add_static('/', path='public', name='static')

    async def serve_index(self, request):
        """Serve the index.html file."""
        return web.FileResponse('public/index.html')

    async def global_search(self, request):
        """Search across all runs"""
        query = request.query.get('q', '').lower()
        limit = int(request.query.get('limit', 100))

        if not query:
            return web.json_response({'results': []})

        results = []

        if self.scraped_data_dir.exists():
            for run_dir in self.scraped_data_dir.iterdir():
                if run_dir.is_dir() and run_dir.name.replace('_', '').isdigit():
                    metadata_file = run_dir / 'metadata.json'
                    if metadata_file.exists():
                        with open(metadata_file, 'r') as f:
                            metadata = json.load(f)

                        # Search in pages
                        for url, page_data in metadata.get('pages', {}).items():
                            if query in url.lower() or query in page_data.get('domain', '').lower():
                                results.append({
                                    'run_id': run_dir.name,
                                    'url': url,
                                    'hash': self.get_url_hash(url),
                                    'domain': page_data.get('domain', ''),
                                    'timestamp': page_data.get('timestamp', ''),
                                    'content_type': page_data.get('content_type', ''),
                                    'size': page_data.get('size', 0)
                                })

                                if len(results) >= limit:
                                    break

                if len(results) >= limit:
                    break

        return web.json_response({
            'results': results[:limit],
            'total': len(results),
            'query': query
        })

    async def preview_page(self, request):
        """Preview page as rendered HTML"""
        run_id = request.match_info['run_id']
        page_hash = request.match_info['page_hash']

        # Sanitize inputs to prevent path traversal
        if not re.match(r'^[a-zA-Z0-9_]+$', run_id) or not re.match(r'^[a-zA-Z0-9]+$', page_hash):
            return web.Response(text="Invalid request parameters", status=400)

        run_dir = self.scraped_data_dir / run_id

        # Try to find the HTML file
        for ext in ['.html', '.htm', '.txt']:
            html_file = run_dir / 'html' / f"{page_hash}{ext}"

            # Security check to ensure the path is within the data directory
            if not html_file.resolve().is_relative_to(self.scraped_data_dir.resolve()):
                return web.Response(text="Forbidden", status=403)

            if html_file.exists():
                async with aiofiles.open(html_file, 'r', encoding='utf-8') as f:
                    content = await f.read()

                # Set base tag for relative URLs to work
                if '<head>' in content:
                    # Insert base tag to help with asset loading
                    base_path = f"/static/{run_id}/"
                    base_tag = f'<base href="{base_path}">'
                    content = content.replace('<head>', f'<head>{base_tag}')

                # Return as HTML response for direct rendering
                return web.Response(text=content, content_type='text/html')

        # If not found, return a proper 404 page
        error_html = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>Page Not Found</title>
            <style>
                body {
                    font-family: -apple-system, BlinkMacSystemFont, sans-serif;
                    display: flex;
                    align-items: center;
                    justify-content: center;
                    height: 100vh;
                    margin: 0;
                    background: #f3f4f6;
                }
                .error-container {
                    text-align: center;
                    padding: 2rem;
                    background: white;
                    border-radius: 0.5rem;
                    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
                }
                h1 { color: #ef4444; }
                p { color: #6b7280; }
            </style>
        </head>
        <body>
            <div class="error-container">
                <h1>404 - Page Not Found</h1>
                <p>The requested page could not be found in the archive.</p>
                <p>It may not have been scraped or could have been filtered out.</p>
            </div>
        </body>
        </html>
        """
        return web.Response(text=error_html, content_type='text/html', status=404)

    async def start_scrape(self, request):
        """Start a new scraping job"""
        if self.active_scrape:
            return web.json_response({
                'error': 'A scrape is already in progress'
            }, status=400)

        try:
            data = await request.json()

            # Update .env file with new configuration
            env_file = '.env'
            for key, value in data.items():
                if key in ['START_URL', 'MAX_WORKERS', 'MAX_DEPTH', 'MAX_PAGES',
                          'PAGES_PER_DOMAIN', 'IMAGE_QUALITY', 'MAX_IMAGE_WIDTH',
                          'COMPRESSION_LEVEL', 'SKIP_ASSETS', 'RESPECT_ROBOTS_TXT',
                          'REQUEST_DELAY']:
                    set_key(env_file, key, str(value))

            # Start the scraping process
            self.active_scrape = await asyncio.create_subprocess_exec(
                'python', 'main.py',
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            # Start monitoring task
            asyncio.create_task(self.monitor_scrape())

            return web.json_response({
                'status': 'started',
                'pid': self.active_scrape.pid
            })

        except Exception as e:
            logger.error(f"Error starting scrape: {e}")
            return web.json_response({
                'error': str(e)
            }, status=500)

    async def monitor_scrape(self):
        """Monitor active scrape process"""
        if self.active_scrape:
            await self.active_scrape.wait()
            self.active_scrape = None
            logger.info("Scraping process completed")

    async def get_scrape_status(self, request):
        """Get current scrape status"""
        if self.active_scrape:
            # Try to read the latest log
            log_content = []
            try:
                with open('web_archiver.log', 'r') as f:
                    # Get last 50 lines
                    log_content = f.readlines()[-50:]
            except:
                pass

            return web.json_response({
                'status': 'running',
                'pid': self.active_scrape.pid,
                'log': log_content
            })

        return web.json_response({
            'status': 'idle'
        })

    async def stop_scrape(self, request):
        """Stop active scrape"""
        if self.active_scrape:
            self.active_scrape.terminate()
            await self.active_scrape.wait()
            self.active_scrape = None

            return web.json_response({
                'status': 'stopped'
            })

        return web.json_response({
            'error': 'No active scrape'
        }, status=400)

    async def get_scrape_config(self, request):
        """Get current scraping configuration"""
        load_dotenv()

        config = {
            'START_URL': os.getenv('START_URL', 'https://example.com'),
            'MAX_WORKERS': int(os.getenv('MAX_WORKERS', 10)),
            'MAX_DEPTH': int(os.getenv('MAX_DEPTH', 3)),
            'MAX_PAGES': int(os.getenv('MAX_PAGES', 100)),
            'PAGES_PER_DOMAIN': int(os.getenv('PAGES_PER_DOMAIN', 50)),
            'IMAGE_QUALITY': int(os.getenv('IMAGE_QUALITY', 85)),
            'MAX_IMAGE_WIDTH': int(os.getenv('MAX_IMAGE_WIDTH', 1920)),
            'COMPRESSION_LEVEL': int(os.getenv('COMPRESSION_LEVEL', 19)),
            'SKIP_ASSETS': os.getenv('SKIP_ASSETS', 'false').lower() == 'true',
            'RESPECT_ROBOTS_TXT': os.getenv('RESPECT_ROBOTS_TXT', 'true').lower() == 'true',
            'REQUEST_DELAY': float(os.getenv('REQUEST_DELAY', 0.5))
        }

        return web.json_response(config)

    async def get_runs(self, request):
        """Get all scraping runs"""
        runs = []

        if self.scraped_data_dir.exists():
            for run_dir in sorted(self.scraped_data_dir.iterdir(), reverse=True):
                if run_dir.is_dir() and run_dir.name.replace('_', '').isdigit():
                    metadata_file = run_dir / 'metadata.json'
                    if metadata_file.exists():
                        with open(metadata_file, 'r') as f:
                            metadata = json.load(f)

                        runs.append({
                            'id': run_dir.name,
                            'timestamp': metadata.get('timestamp', run_dir.name),
                            'start_url': metadata.get('start_url', ''),
                            'total_pages': metadata.get('total_pages', 0),
                            'pages_scraped': metadata.get('pages_scraped', 0),
                            'stats': metadata.get('stats', {})
                        })

        return web.json_response(runs)

    async def get_run_details(self, request):
        """Get details for a specific run"""
        run_id = request.match_info['run_id']
        run_dir = self.scraped_data_dir / run_id

        if not run_dir.exists():
            return web.json_response({'error': 'Run not found'}, status=404)

        metadata_file = run_dir / 'metadata.json'
        if not metadata_file.exists():
            return web.json_response({'error': 'Metadata not found'}, status=404)

        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        # Check for compression report
        compression_report = None
        for archive_file in self.archives_dir.glob('compression_report.json'):
            with open(archive_file, 'r') as f:
                report = json.load(f)
                if run_id in report.get('source_directory', ''):
                    compression_report = report
                    break

        return web.json_response({
            'id': run_id,
            'metadata': metadata,
            'compression_report': compression_report
        })

    async def get_run_pages(self, request):
        """Get all pages for a specific run"""
        run_id = request.match_info['run_id']
        run_dir = self.scraped_data_dir / run_id

        # Pagination parameters
        page = int(request.query.get('page', 1))
        per_page = int(request.query.get('per_page', 50))
        search = request.query.get('search', '').lower()
        domain_filter = request.query.get('domain', '')

        metadata_file = run_dir / 'metadata.json'
        if not metadata_file.exists():
            return web.json_response({'error': 'Metadata not found'}, status=404)

        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        # Get pages and apply filters
        pages = metadata.get('pages', {})
        filtered_pages = []

        for url, page_data in pages.items():
            # Apply search filter
            if search and search not in url.lower() and search not in page_data.get('domain', '').lower():
                continue

            # Apply domain filter
            if domain_filter and page_data.get('domain', '') != domain_filter:
                continue

            filtered_pages.append({
                'url': url,
                'hash': self.get_url_hash(url),
                **page_data
            })

        # Sort by timestamp
        filtered_pages.sort(key=lambda x: x.get('timestamp', ''), reverse=True)

        # Pagination
        total = len(filtered_pages)
        start = (page - 1) * per_page
        end = start + per_page

        return web.json_response({
            'pages': filtered_pages[start:end],
            'pagination': {
                'page': page,
                'per_page': per_page,
                'total': total,
                'total_pages': (total + per_page - 1) // per_page
            }
        })

    async def get_page_content(self, request):
        """Get content of a specific page"""
        run_id = request.match_info['run_id']
        page_hash = request.match_info['page_hash']

        run_dir = self.scraped_data_dir / run_id

        # Try to find the file in html or assets directory
        for subdir in ['html', 'assets']:
            for ext in ['.html', '.css', '.js', '.json', '.txt']:
                filepath = run_dir / subdir / f"{page_hash}{ext}"
                if filepath.exists():
                    async with aiofiles.open(filepath, 'r', encoding='utf-8') as f:
                        content = await f.read()

                    return web.json_response({
                        'content': content,
                        'type': ext[1:],
                        'size': len(content)
                    })

        return web.json_response({'error': 'Page content not found'}, status=404)

    async def get_run_stats(self, request):
        """Get statistics for a specific run"""
        run_id = request.match_info['run_id']
        run_dir = self.scraped_data_dir / run_id

        metadata_file = run_dir / 'metadata.json'
        if not metadata_file.exists():
            return web.json_response({'error': 'Metadata not found'}, status=404)

        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        stats = metadata.get('stats', {})
        domain_counts = metadata.get('domain_counts', {})

        # Calculate additional stats
        pages = metadata.get('pages', {})
        content_types = {}
        depths = {}

        for page_data in pages.values():
            # Content type distribution
            ct = page_data.get('content_type', 'unknown')
            ct_simple = ct.split(';')[0].strip()
            content_types[ct_simple] = content_types.get(ct_simple, 0) + 1

            # Depth distribution
            depth = page_data.get('depth', 0)
            depths[str(depth)] = depths.get(str(depth), 0) + 1

        return web.json_response({
            'basic_stats': stats,
            'domain_distribution': domain_counts,
            'content_types': content_types,
            'depth_distribution': depths,
            'top_domains': sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]
        })

    async def get_archives(self, request):
        """Get all archives"""
        archives = []

        if self.archives_dir.exists():
            for archive_file in sorted(self.archives_dir.glob('*.tar.*'), reverse=True):
                archives.append({
                    'name': archive_file.name,
                    'size': archive_file.stat().st_size,
                    'created': datetime.fromtimestamp(archive_file.stat().st_mtime).isoformat()
                })

        return web.json_response(archives)

    def get_url_hash(self, url):
        """Generate hash for URL"""
        import hashlib
        return hashlib.md5(url.encode()).hexdigest()

    async def start(self):
        """Start the server"""
        runner = web.AppRunner(self.app)
        await runner.setup()
        site = web.TCPSite(runner, 'localhost', self.port)
        await site.start()

        logger.info(f"Server started at http://localhost:{self.port}")
        logger.info("Press Ctrl+C to stop")

        # Keep the server running
        try:
            await asyncio.Event().wait()
        except KeyboardInterrupt:
            logger.info("Server stopped")

async def main():
    server = WebArchiveServer()
    await server.start()

if __name__ == '__main__':
    asyncio.run(main())