#!/usr/bin/env python3
"""
Example usage of the content scraper system.
Demonstrates how to use the scraper programmatically.
"""

from datetime import datetime, timedelta

from loguru import logger

from config.settings import get_author_config
from scrapers.blog_scraper import BlogScraper
from scrapers.twitter_scraper import TwitterScraper
from validators.authenticity_validator import AuthenticityValidator
from storage.database import ContentDatabase
from processing.text_processor import TextProcessor
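
# NOTE: the imports above assume this script sits at the repository root,
# next to the config/, scrapers/, validators/, storage/, and processing/
# packages (an assumption inferred from the import paths, not documented
# here). If you run it from elsewhere, put the repo root on PYTHONPATH:
#
#   PYTHONPATH=/path/to/repo python3 example_usage.py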


def example_blog_scrape():
    """Example: Scrape Tim Ferriss blog posts."""
    print("\n=== Example 1: Blog Scraping ===\n")

    # Get author configuration
    author_config = get_author_config('tim_ferriss')

    # Initialize scraper
    scraper = BlogScraper('tim_ferriss', author_config)

    # Scrape recent posts
    date_from = datetime.now() - timedelta(days=365)  # Last year
    content = scraper.scrape(max_pages=5, date_from=date_from)

    print(f"Scraped {len(content)} blog posts")

    # Show first item
    if content:
        first_post = content[0]
        print(f"\nFirst post: {first_post['title']}")
        print(f"URL: {first_post['url']}")
        print(f"Published: {first_post['date_published']}")
        print(f"Word count: {first_post['metadata']['word_count']}")


def example_validation():
    """Example: Validate content authenticity."""
    print("\n=== Example 2: Content Validation ===\n")

    # Create a sample content object
    sample_content = {
        'id': 'test123',
        'author': 'tim_ferriss',
        'platform': 'blog',
        'title': 'Test Post',
        'content': 'This is a test post about productivity and life optimization.',
        'url': 'https://tim.blog/test-post/',
        'metadata': {}
    }

    # Validate
    validator = AuthenticityValidator()
    validated = validator.validate(sample_content)

    print(f"Authenticity score: {validated['authenticity_score']}")
    print(f"Passed validation: {validated['metadata']['validation']['passed']}")


def example_database():
    """Example: Store and retrieve from database."""
    print("\n=== Example 3: Database Operations ===\n")

    # Initialize database
    db = ContentDatabase()

    # Get statistics
    stats = db.get_statistics()
    print(f"Total items in database: {stats.get('total_content', 0)}")

    # Get content for a specific author
    tim_content = db.get_content_by_author('tim_ferriss', limit=5)
    print(f"\nFound {len(tim_content)} items for Tim Ferriss")

    if tim_content:
        print(f"\nLatest item: {tim_content[0]['title']}")


def example_text_processing():
    """Example: Process text content."""
    print("\n=== Example 4: Text Processing ===\n")

    # Sample text
    text = """
    The 4-Hour Workweek is about lifestyle design and automation.
    Tim Ferriss discusses productivity, outsourcing, and mini-retirements.
    The book covers topics like elimination, automation, and liberation.
    """

    # Initialize processor
    processor = TextProcessor()

    # Extract keywords
    keywords = processor.extract_keywords(text)
    print(f"Keywords: {', '.join(keywords)}")

    # Chunk text
    chunks = processor.chunk_text(text, chunk_size=100, overlap=20)
    print(f"\nText split into {len(chunks)} chunks")

    # Calculate readability
    readability = processor.calculate_readability(text)
    print(f"Readability score: {readability:.1f}/100")


def example_full_pipeline():
    """Example: Full scraping pipeline."""
    print("\n=== Example 5: Full Pipeline ===\n")

    # 1. Scrape content
    author_config = get_author_config('balaji_srinivasan')
    scraper = BlogScraper('balaji_srinivasan', author_config)

    print("Step 1: Scraping...")
    content = scraper.scrape(max_pages=2)
    print(f"✓ Scraped {len(content)} items")

    if not content:
        print("No content scraped. Exiting.")
        return

    # 2. Validate
    print("\nStep 2: Validating...")
    validator = AuthenticityValidator()
    validated = validator.validate_batch(content)
    authentic = [c for c in validated if c['authenticity_score'] >= 75]
    print(f"✓ {len(authentic)} items passed validation")

    # 3. Process
    print("\nStep 3: Processing...")
    processor = TextProcessor()
    processed = [processor.process(c) for c in authentic]
    print(f"✓ Processed {len(processed)} items")

    # 4. Store
    print("\nStep 4: Storing...")
    db = ContentDatabase()
    saved = db.save_batch(processed)
    print(f"✓ Saved {saved} items to database")

    # 5. Retrieve and display
    print("\nStep 5: Retrieving...")
    stored_content = db.get_content_by_author('balaji_srinivasan', limit=1)

    if stored_content:
        item = stored_content[0]
        print("\nSample stored item:")
        print(f"  Title: {item['title']}")
        print(f"  Platform: {item['platform']}")
        print(f"  Authenticity: {item['authenticity_score']}")
        print(f"  Processed: {item['processed']}")
        print(f"  Keywords: {', '.join(item['metadata'].get('keywords', [])[:5])}")


def main():
    """Run all examples."""
    print("=" * 60)
    print("Content Scraper - Example Usage")
    print("=" * 60)

    try:
        # Run examples
        example_blog_scrape()
        example_validation()
        example_database()
        example_text_processing()
        example_full_pipeline()
    except Exception as e:
        # loguru's logger.error() has no exc_info kwarg (that is stdlib
        # logging); logger.exception() records the traceback in loguru.
        logger.exception(f"Error in examples: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("=" * 60)


if __name__ == '__main__':
    main()