Python_Reddit_project1/generate_visualizations.py at main · SunnyDevendranadh/Python_Reddit_project1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import ast
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def _parse_comments(raw_comments):
    """Parse one ``comments`` cell into a sequence of comments.

    Args:
        raw_comments: The raw cell value. Presumably the ``repr`` of a Python
            list written by the scraper -- confirm against the producer.

    Returns:
        The parsed list/tuple, or an empty tuple when the cell is missing,
        is not a valid Python literal, or does not evaluate to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(raw_comments)
    except (ValueError, SyntaxError, TypeError):
        # Best effort: NaN cells and malformed literals are skipped.
        return ()
    if not isinstance(parsed, (list, tuple)):
        # A bare string would otherwise be split into characters and a
        # number would raise TypeError when iterated.
        return ()
    return parsed


# Read the data
df = pd.read_csv('posts_data.csv')

# Flatten comments: one record per (post, comment) pair
flattened_comments = []
for post_title, post_body, raw_comments in zip(df['title'], df['body'], df['comments']):
    for comment in _parse_comments(raw_comments):
        flattened_comments.append({
            'post_title': post_title,
            'post_body': post_body,
            'comment': comment,
        })

# Explicit columns keep the frame's schema stable even when no comment could
# be parsed; otherwise comments_df['comment'] would raise KeyError.
comments_df = pd.DataFrame(
    flattened_comments,
    columns=['post_title', 'post_body', 'comment'],
)

# Define use cases and pain points
use_case_keywords = {
    'coding assistance': ['code', 'coding', 'programming', 'developer', 'debug', 'script'],
    'research': ['research', 'source', 'find information', 'lookup', 'deep dive'],
    'writing/editing': ['writing', 'writer', 'edit', 'grammar', 'proofread', 'rewrite'],
    'summarization': ['summarize', 'summary', 'tl;dr', 'too long', 'condense'],
    'problem-solving': ['solve', 'solution', 'problem', 'stuck', 'fix'],
    'learning new concepts': ['learn', 'understand', 'teach', 'explain', 'study']
}

pain_point_keywords = {
    'too verbose': ['too long', 'verbose', 'overly detailed', 'ramble'],
    'lack of accuracy': ['inaccurate', 'wrong', 'error', 'mistake'],
    'context issues': ['forgot context', 'lost context', "doesn't remember", 'context reset'],
    'slow performance': ['slow', 'lag', 'takes too long', 'delay'],
    'limitations/frustrations': ["can't do", 'limit', 'frustrating', 'annoying', 'disappointing'],
    'dependency issues': ['dependency', 'import error', 'module not found', 'requirements.txt']
}

# Typographic single quotes are mapped to the ASCII apostrophe so keywords
# such as "can't do" also match text typed with curly quotes.
_APOSTROPHE_TABLE = str.maketrans({'\u2018': "'", '\u2019': "'"})


def _compile_keyword_patterns(keyword_map):
    """Compile one whole-word regex per category.

    Args:
        keyword_map: Mapping of category name to a list of keyword phrases.

    Returns:
        Dict mapping each category (in the original order) to a compiled
        pattern that matches any of its keywords between word boundaries.
        Categories with no keywords are omitted so they can never match.
    """
    return {
        category: re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b'
        )
        for category, keywords in keyword_map.items()
        if keywords
    }


# Compiled once at import time; rebuild these if the keyword tables change.
_USE_CASE_PATTERNS = _compile_keyword_patterns(use_case_keywords)
_PAIN_POINT_PATTERNS = _compile_keyword_patterns(pain_point_keywords)


def _tag_text(comment_text, patterns):
    """Return the categories whose pattern occurs in ``comment_text``.

    Args:
        comment_text: Any value; non-strings are converted with ``str()``.
        patterns: Mapping of category name to compiled regex.

    Returns:
        List of matching category names in the mapping's order; an empty
        list for missing scalars (``None``, NaN, ``pd.NA``).
    """
    # pd.isna() returns an array for list-likes, which cannot be used as a
    # condition, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(comment_text) and pd.isna(comment_text):
        return []
    normalized = str(comment_text).lower().translate(_APOSTROPHE_TABLE)
    return [
        category
        for category, pattern in patterns.items()
        if pattern.search(normalized)
    ]


def tag_usecases(comment_text):
    """Return the use-case categories mentioned in ``comment_text``.

    Matching is case-insensitive and whole-word against
    ``use_case_keywords``. Missing values yield an empty list.
    """
    return _tag_text(comment_text, _USE_CASE_PATTERNS)


def tag_pain_points(comment_text):
    """Return the pain-point categories mentioned in ``comment_text``.

    Matching is case-insensitive and whole-word against
    ``pain_point_keywords``. Missing values yield an empty list.
    """
    return _tag_text(comment_text, _PAIN_POINT_PATTERNS)

# Tag every comment: each target column is filled by running its tagger
# over the raw comment text.
for tag_column, tagger in (
    ('use_case_tags', tag_usecases),
    ('pain_point_tags', tag_pain_points),
):
    comments_df[tag_column] = comments_df['comment'].apply(tagger)

def _save_count_barplot(counts, palette, title, ylabel, output_path):
    """Draw a horizontal bar chart of tag counts and save it as an image.

    Args:
        counts: Series of mention counts indexed by tag label, as produced
            by ``value_counts()``.
        palette: Seaborn palette name used for the bars.
        title: Chart title.
        ylabel: Label for the category axis.
        output_path: Destination image path; its parent directory is
            created when it does not exist yet.
    """
    # savefig() does not create missing directories, so make sure the
    # target folder exists before drawing.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=counts.values, y=counts.index, palette=palette)
    plt.title(title, fontsize=14, pad=20)
    plt.xlabel('Number of Mentions', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)

    # Add value labels at the end of each bar
    for position, count in enumerate(counts.values):
        plt.text(count, position, str(count), va='center', fontsize=10)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()


# 1. Use Case Distribution
all_use_cases = [tag for tags in comments_df['use_case_tags'] for tag in tags]
use_case_counts = pd.Series(all_use_cases).value_counts()
_save_count_barplot(
    use_case_counts,
    palette='viridis',
    title='Distribution of Use Cases in Claude AI Discussions',
    ylabel='Use Case',
    output_path='visualizations/use_case_distribution.png',
)

# 2. Pain Point Distribution
all_pain_points = [tag for tags in comments_df['pain_point_tags'] for tag in tags]
pain_point_counts = pd.Series(all_pain_points).value_counts()
_save_count_barplot(
    pain_point_counts,
    palette='Reds',
    title='Distribution of Pain Points in Claude AI Discussions',
    ylabel='Pain Point',
    output_path='visualizations/pain_point_distribution.png',
)

# 3. Use Case vs Pain Point Correlation
# Count, for every (use case, pain point) pair, how many comments carry both
# tags. A Counter over the tag pairs avoids per-cell DataFrame writes inside
# a row-by-row loop.
pair_counts = Counter(
    (use_case, pain_point)
    for use_cases, pain_points in zip(comments_df['use_case_tags'],
                                      comments_df['pain_point_tags'])
    for use_case in use_cases
    for pain_point in pain_points
)

# Rows are use cases and columns are pain points, both in the order the
# keyword tables define them; pairs that never co-occur stay at 0.
use_case_pain_point_matrix = pd.DataFrame(
    [[pair_counts[(use_case, pain_point)] for pain_point in pain_point_keywords]
     for use_case in use_case_keywords],
    index=list(use_case_keywords),
    columns=list(pain_point_keywords),
)

plt.figure(figsize=(12, 8))
sns.heatmap(use_case_pain_point_matrix, annot=True, fmt='g', cmap='YlOrRd')
plt.title('Correlation between Use Cases and Pain Points', fontsize=14, pad=20)
plt.xlabel('Pain Points', fontsize=12)
plt.ylabel('Use Cases', fontsize=12)
plt.tight_layout()
# savefig() does not create missing directories.
os.makedirs('visualizations', exist_ok=True)
plt.savefig('visualizations/use_case_pain_point_correlation.png', dpi=300, bbox_inches='tight')
plt.close()

print("Visualizations have been generated in the 'visualizations' directory.")