#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 8 17:48:22 2018
@author: evita
"""
from __future__ import division
import pandas as pd
import os
import nltk
import string
import re
from ast import literal_eval
import numpy as np
import matplotlib.pyplot as plt
import preprocess as data_prep  # local preprocessing module, used by the commented-out experiments below
root_sarcasm_data_dir = "../sarcasm_data/"  # put the data (train-balanced-sarcasm.csv)
                                            # in a folder named "sarcasm_data" one level up
sarcasm_file = "train-balanced-sarcasm.csv"
train_file = 'train.csv'
train_with_stopwords_file = 'with_stopwords_train_cleaned.csv'
test_with_stopwords_file = 'with_stopwords_test_cleaned.csv'
val_with_stopwords_file = 'with_stopwords_val_cleaned.csv'
test_file = 'test.csv'
validate_file = 'validate.csv'
train_file_cleaned = "train_cleaned.csv"
validate_file_cleaned = "validate_cleaned.csv"
test_file_cleaned = "test_cleaned.csv"
yelp_file = "review_sampled.csv"

def load_data(root_sarcasm, sarcasm_file, subset_size=None):
    print "\n**** Loading data ****"
    print "**** loading file: " + sarcasm_file
    # create the data dir if it is missing (the CSV itself must be placed there manually)
    if not os.path.exists(root_sarcasm):
        os.makedirs(root_sarcasm)
    df = pd.read_csv(root_sarcasm + sarcasm_file)
    if subset_size is not None:  # optionally work on a random subset for speed
        df = df.sample(n=subset_size)
    return df
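
# Example usage (a minimal sketch; subset_size is handy for quick local runs
# on the large Kaggle file):
#
#   df = load_data(root_sarcasm_data_dir, sarcasm_file, subset_size=10000)
#   print df.shape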

def plot_lengths(df, column):
    lengths = list()
    words = set()
    threshold_count = 0
    for l in df[column]:
        lengths.append(len(l))
        if len(l) >= 50:
            threshold_count += 1
        for w in l:
            words.add(w)
    print "total comments:", len(lengths)
    print "avg comment length:", np.mean(lengths)
    total_words = np.sum(lengths)
    print "total (non unique) words:", total_words
    print "total unique words:", len(words)
    print "total comments above the length threshold (50):", threshold_count
    # histogram of comment lengths in 10-token buckets
    plt.figure()
    bins = range(0, 200, 10)
    plt.hist(lengths, bins=bins, label=column)
    plt.legend()
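
# Note: the bins stop at 200, so comments longer than 200 tokens fall outside
# the plotted range (they are still counted in the printed totals). A quick
# standalone check on hypothetical toy data:
#
#   toy = pd.DataFrame({'clean_comments': [['a', 'b'], ['c'] * 60]})
#   plot_lengths(toy, 'clean_comments')  # reports 1 comment above the threshold
#   plt.show()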

def load_preprocessed_file(filename):
    print "\n**** LOADING PREPROCESSED FILE: " + filename + " ..."
    column_names = ['label', 'clean_comments']
    df = pd.read_csv(root_sarcasm_data_dir + filename, usecols=column_names,
                     converters={"clean_comments": literal_eval})
    df_data = df.drop(['label'], axis=1)
    df_target = df.drop(['clean_comments'], axis=1)
    print "total positive vs negative examples in dataset:\n", df_target['label'].value_counts()
    return df_data, df_target
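
# The *_cleaned.csv files are produced by the preprocessing step (see the
# preprocess import above). 'clean_comments' is assumed to hold a stringified
# Python list of tokens, which is why literal_eval is used as a converter:
#
#   label,clean_comments
#   1,"['yeah', 'right']"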

def get_vocabulary_size(df):
    vocabulary = set()
    total_comments = df.shape[0]
    df2 = df['comment'].astype(str)
    vocabulary_after = set()
    # vocabulary of the raw comments (whitespace-split tokens)
    df2.str.split().apply(vocabulary.update)
    print "total vocab:", len(vocabulary)
    print "avg unique words per comment:", len(vocabulary) / total_comments
    print "shape before cleaning:", df2.shape
    # lowercase, strip punctuation and non-alphabetic characters, then tokenize
    df['clean_comment'] = df['comment'].astype(str).apply(lambda x: ''.join(
        [re.sub(r'[^a-z\s]', '', i.lower()) for i in x if i not in string.punctuation]))
    df['clean_comment'] = df['clean_comment'].apply(nltk.word_tokenize)
    # drop comments that became empty after cleaning
    df['empty_list_comments'] = df['clean_comment'].apply(lambda c: c == [])
    df.drop(df[df['empty_list_comments'] == True].index, inplace=True)
    print "shape after cleaning:", df.shape
    df['clean_comment'].apply(vocabulary_after.update)
    print "vocab after cleaning:", len(vocabulary_after)
    print "avg unique words per comment after cleaning:", len(vocabulary_after) / df.shape[0]
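
# For reference, the pre-cleaning vocabulary count above is equivalent to this
# one-liner (a sketch over the same 'comment' column):
#
#   vocab = set(" ".join(df['comment'].astype(str)).split())
#   print len(vocab)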

def dataset_analysis():
    df = load_data(root_sarcasm_data_dir, sarcasm_file)
    print "total comments:", df.shape
    # get_vocabulary_size(df)
    df_train_cleaned_data, df_train_labels = load_preprocessed_file(train_file_cleaned)
    # df_trunc = data_prep.truncate_document(df=df_train_cleaned_data, max_length=120)
    print "results for train with stopword removal"
    plot_lengths(df_train_cleaned_data, 'clean_comments')
    df_test_cleaned_data, df_test_labels = load_preprocessed_file(test_file_cleaned)
    print "results for test with stopword removal"
    plot_lengths(df_test_cleaned_data, 'clean_comments')
    # results without stopword removal:
    # df_train_cleaned_with_stop_data, df_train_stop_labels = load_preprocessed_file(train_with_stopwords_file)
    # plot_lengths(df_train_cleaned_with_stop_data, 'clean_comments')
    df_validate_cleaned_data, df_val_labels = load_preprocessed_file(validate_file_cleaned)
    print "results for validate with stopword removal"
    plot_lengths(df_validate_cleaned_data, 'clean_comments')
    plt.show()  # display the length histograms

def main():
    dataset_analysis()

if __name__ == '__main__':
    main()
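
# To run the full analysis (assumes the cleaned CSVs already exist under
# ../sarcasm_data/, presumably produced by preprocess.py):
#
#   python2 dataset_analysis.py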