-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
78 lines (62 loc) · 3.01 KB
/
utils.py
File metadata and controls
78 lines (62 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Module: Helper Functions for Sentiment Analysis
Author: Silvio Christian, Joe
Description:
This module contains utility functions for text processing and file handling.
It includes logic for N-gram extraction (finding top frequent words/phrases)
specifically tailored for English and Indonesian contexts, as well as
data serialization for Streamlit downloads.
"""
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Ensure stopwords are downloaded quietly to avoid console clutter
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
def get_top_n_words_en(corpus, n, ngram_range):
"""
Extracts the top N most frequent words or n-grams from an English corpus.
Args:
corpus (Series): The text data to analyze.
n (int): Number of top words to return.
ngram_range (tuple): The range of n-values for different n-grams (e.g., (1, 1) for unigrams).
Returns:
list: A sorted list of tuples [(word, frequency), ...]
"""
# Initialize Vectorizer with standard English stopwords
vec = CountVectorizer(ngram_range=ngram_range, stop_words='english').fit(corpus)
# Transform corpus into a sparse matrix of counts
bag = vec.transform(corpus)
# Sum word occurrences across all documents
sum_words = bag.sum(axis=0)
# Map vocabulary indices back to words and their frequencies
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
# Sort by frequency in descending order and slice the top N
return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]
def get_top_n_words_id(corpus, n, ngram_range):
"""
Extracts the top N most frequent words or n-grams from an Indonesian corpus.
Includes custom stopword handling to preserve sentiment-bearing words.
"""
# Load standard Indonesian stopwords from NLTK
stopword = stopwords.words('indonesian')
# Custom Stopword Adjustment:
# "Luar" and "Biasa" are standard stopwords, but "Luar Biasa" (Extraordinary)
# is a strong sentiment indicator. We remove them to preserve this meaning.
if "luar" in stopword or "biasa" in stopword:
stopword.remove("luar")
stopword.remove("biasa")
# Initialize Vectorizer with the modified Indonesian stopword list
vec = CountVectorizer(ngram_range=ngram_range, stop_words=stopword).fit(corpus)
# Transform corpus into a sparse matrix
bag = vec.transform(corpus)
# Sum word occurrences
sum_words = bag.sum(axis=0)
# Map vocabulary and sort results
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]
def convert_for_download(df):
"""
Serializes a Pandas DataFrame into a CSV byte string.
Required for Streamlit's download_button widget.
"""
return df.to_csv(index=False).encode("utf-8")