# Natural-language-AI/text_classifier.py at main · Reaishma/Natural-language-AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""
Text Classification Module
Classifies text into predefined categories using TextBlob and basic heuristics
"""
import streamlit as st
from textblob import TextBlob
import re
from typing import Dict, Any
import pandas as pd

class TextClassifier:
    """Keyword-based text classifier with a sentiment fallback.

    Every category owns a list of lowercase keywords. A text's score for a
    category is the number of whole-word keyword hits divided by the number
    of whitespace-separated words in the text. When no keyword matches at
    all, the polarity reported by TextBlob decides between the "personal"
    and "general" categories.
    """

    # Built once at class creation instead of on every description lookup.
    _DESCRIPTIONS = {
        "technology": "Technology-related content including computers, software, and digital topics",
        "business": "Business and finance-related content",
        "sports": "Sports and athletics-related content",
        "health": "Health and medical-related content",
        "education": "Educational and academic content",
        "entertainment": "Entertainment industry and media content",
        "news": "News and journalism content",
        "personal": "Personal experiences and opinions",
        "general": "General content that doesn't fit specific categories",
        "unknown": "Unable to classify the content"
    }

    def __init__(self):
        # Kept as an instance attribute so each classifier's keyword lists
        # can be inspected or adjusted independently.
        self.categories = {
            "technology": ["computer", "software", "tech", "programming", "code", "app", "digital", "internet", "ai", "machine learning"],
            "business": ["company", "market", "finance", "money", "profit", "sales", "business", "corporate", "investment"],
            "sports": ["game", "player", "team", "score", "match", "sport", "football", "basketball", "soccer", "tennis"],
            "health": ["doctor", "medicine", "hospital", "health", "disease", "treatment", "medical", "patient", "therapy"],
            "education": ["school", "student", "teacher", "learn", "education", "university", "study", "class", "academic"],
            "entertainment": ["movie", "music", "show", "actor", "celebrity", "film", "concert", "entertainment", "tv"],
            "news": ["breaking", "report", "news", "journalist", "headline", "story", "media", "press"],
            "personal": ["i", "me", "my", "myself", "personal", "life", "family", "friend", "relationship"]
        }

    def _score_categories(self, text_lower: str, word_count: int) -> Dict[str, float]:
        """Return the per-category keyword density for lower-cased text.

        Args:
            text_lower: The text to scan, already lower-cased.
            word_count: Number of words in the text; must be at least 1.

        Returns:
            Mapping of category name to (whole-word keyword hits / word_count).
        """
        scores = {}
        for category, keywords in self.categories.items():
            # \b anchors keep "ai" from matching inside words such as "train".
            hits = sum(
                len(re.findall(r'\b' + re.escape(keyword) + r'\b', text_lower))
                for keyword in keywords
            )
            scores[category] = hits / word_count
        return scores

    def classify_text(self, text: str) -> Dict[str, Any]:
        """
        Classify text into categories based on keyword matching and sentiment

        Args:
            text: The text to classify.

        Returns:
            On success a dict with keys "category", "confidence",
            "all_scores", "text_length" and "word_count".
            When the stripped text has fewer than 3 characters, a dict with
            category "unknown", confidence 0.0 and an "error" message.
            When anything unexpected fails, a dict with category "error",
            confidence 0.0 and an "error" message; no exception propagates.
        """
        try:
            if not text or len(text.strip()) < 3:
                return {
                    "category": "unknown",
                    "confidence": 0.0,
                    "error": "Text too short for classification"
                }

            # The guard above guarantees at least one word, so the division in
            # _score_categories is safe; computed once instead of per category.
            word_count = len(text.split())
            category_scores = self._score_categories(text.lower(), word_count)

            # default=None keeps an empty category map from raising; ties are
            # resolved in favour of the first category in insertion order.
            best_category = max(category_scores, key=category_scores.get, default=None)

            if best_category is not None and category_scores[best_category] > 0:
                confidence = min(category_scores[best_category] * 2, 1.0)  # Scale confidence
            else:
                # Fallback: use sentiment to determine if it's personal or general.
                # TextBlob is only touched here, so keyword-based results do not
                # depend on it.
                sentiment = TextBlob(text).sentiment.polarity
                if abs(sentiment) > 0.3:  # Strong sentiment suggests personal content
                    best_category = "personal"
                    confidence = abs(sentiment) * 0.7
                else:
                    best_category = "general"
                    confidence = 0.3

            return {
                "category": best_category,
                "confidence": confidence,
                "all_scores": category_scores,
                "text_length": len(text),
                "word_count": word_count
            }

        except Exception as e:
            # Deliberate catch-all boundary: callers check for the "error" key
            # rather than handling exceptions.
            return {
                "category": "error",
                "confidence": 0.0,
                "error": f"Classification failed: {str(e)}"
            }

    def get_category_description(self, category: str) -> str:
        """
        Get description for a category

        Args:
            category: Category name as returned by classify_text.

        Returns:
            The matching description, or "No description available" when the
            category has no entry.
        """
        return self._DESCRIPTIONS.get(category, "No description available")

# Streamlit interface for text classification
def create_text_classification_interface():
    """
    Render the Streamlit page for text classification.

    The page offers a text area or a .txt upload as input. Pressing the
    "Classify Text" button runs TextClassifier.classify_text on the input and
    shows the category, confidence, category description, per-category score
    table and text statistics, or an error message when classification
    reports one or no text was supplied.
    """
    st.header("📝 Text Classification")
    st.write("Classify your text into different categories to understand its content type.")

    classifier = TextClassifier()

    # Input options
    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"])

    text_to_classify = ""
    if input_method == "Text Input":
        text_to_classify = st.text_area(
            "Enter text to classify:",
            height=150,
            placeholder="Type or paste your text here..."
        )
    else:
        uploaded_file = st.file_uploader(
            "Upload a text file:",
            type=['txt'],
            help="Upload a .txt file to classify its content"
        )
        if uploaded_file:
            from utils.helpers import handle_file_upload
            text_to_classify = handle_file_upload(uploaded_file)

    # Nothing more to render until the user presses the button.
    if not st.button("Classify Text", type="primary"):
        return

    if not text_to_classify or len(text_to_classify.strip()) == 0:
        from utils.helpers import display_error
        display_error("Please enter some text to classify.")
        return

    with st.spinner("Classifying text..."):
        results = classifier.classify_text(text_to_classify)

        if "error" in results:
            from utils.helpers import display_error
            display_error(results["error"])
            return

        # Headline result: category and confidence side by side
        category_col, confidence_col = st.columns(2)
        with category_col:
            st.metric("Category", results["category"].title())
        with confidence_col:
            st.metric("Confidence", f"{results['confidence']:.1%}")

        # Category description
        description = classifier.get_category_description(results["category"])
        st.info(f"**Category Description:** {description}")

        # The score table and statistics are only shown when scores exist.
        if "all_scores" not in results:
            return

        st.subheader("Detailed Category Scores")
        score_rows = [
            {"Category": name.title(), "Score": value}
            for name, value in results["all_scores"].items()
        ]
        scores_df = pd.DataFrame(score_rows).sort_values("Score", ascending=False)
        st.dataframe(scores_df, use_container_width=True)

        # Text statistics
        st.subheader("Text Statistics")
        chars_col, words_col = st.columns(2)
        with chars_col:
            st.metric("Characters", results.get("text_length", 0))
        with words_col:
            st.metric("Words", results.get("word_count", 0))

# Script entry point: render the classification page when this file is run as
# the main module.
if __name__ == "__main__":
    # NOTE(review): pandas is already imported as `pd` at the top of the
    # module, so this re-import looks redundant — confirm before removing.
    import pandas as pd
    create_text_classification_interface()