diff --git a/cybersecurity_eda.ipynb b/cybersecurity_eda.ipynb
new file mode 100644
index 0000000..fae9856
--- /dev/null
+++ b/cybersecurity_eda.ipynb
@@ -0,0 +1,1271 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cybersecurity Attack Type Detection - EDA\n",
+    "## Focus: Proxy + IP Trends, Spoofing Detection, and Data Bin Trends\n",
+    "\n",
+    "**Team Member:** [Your Name]  \n",
+    "**Date:** January 31, 2026  \n",
+    "**Dataset:** 40,000 rows, 25 features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 1. Setup and Data Loading"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from collections import Counter\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "# Set visualization style\n",
+    "sns.set_style(\"whitegrid\")\n",
+    "plt.rcParams['figure.figsize'] = (15, 8)\n",
+    "plt.rcParams['font.size'] = 10\n",
+    "\n",
+    "print(\"✓ Libraries imported successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load dataset\n",
+    "# TODO: Update the filepath to your actual CSV file location\n",
+    "df = pd.read_csv('your_dataset.csv')\n",
+    "\n",
+    "print(f\"Dataset Shape: {df.shape}\")\n",
+    "print(f\"Total Records: {df.shape[0]:,}\")\n",
+    "print(f\"Total Features: {df.shape[1]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display first few rows\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data types and basic info\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Missing values analysis\n",
+    "# Compute the per-column null and distinct counts once and reuse them.\n",
+    "n_rows = len(df)\n",
+    "null_counts = df.isnull().sum()\n",
+    "distinct_counts = df.nunique()\n",
+    "\n",
+    "missing_df = pd.DataFrame({\n",
+    "    'Missing_Count': null_counts,\n",
+    "    'Percentage': (null_counts / n_rows) * 100,\n",
+    "    'Distinct_Count': distinct_counts,\n",
+    "    'Distinct_Percentage': (distinct_counts / n_rows) * 100\n",
+    "})\n",
+    "# Most incomplete columns first\n",
+    "missing_df = missing_df.sort_values('Missing_Count', ascending=False)\n",
+    "\n",
+    "separator = \"=\" * 80\n",
+    "print(\"\\n\" + separator)\n",
+    "print(\"MISSING VALUES AND DISTINCTNESS ANALYSIS\")\n",
+    "print(separator)\n",
+    "print(missing_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Attack Type distribution\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    print(\"\\nAttack Type Distribution:\")\n",
+    "    print(df['Attack Type'].value_counts())\n",
+    "    \n",
+    "    plt.figure(figsize=(12, 6))\n",
+    "    df['Attack Type'].value_counts().plot(kind='bar', color='steelblue', edgecolor='black')\n",
+    "    plt.title('Attack Type Distribution', fontsize=14, fontweight='bold')\n",
+    "    plt.xlabel('Attack Type')\n",
+    "    plt.ylabel('Count')\n",
+    "    plt.xticks(rotation=45, ha='right')\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 2. PART 1: Proxy Information Analysis\n",
+    "\n",
+    "**Key Insights from Data Profiling:**\n",
+    "- ~49.6% missing values (19,851 out of 40,000)\n",
+    "- 20,148 distinct values when present (highly diverse)\n",
+    "- This suggests proxy info is present only for certain attacks/sources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Proxy Information Analysis\n",
+    "print(\"=\"*80)\n",
+    "print(\"PROXY INFORMATION ANALYSIS\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "if 'Proxy Information' in df.columns:\n",
+    "    # Basic statistics\n",
+    "    total_records = len(df)\n",
+    "    proxy_present = df['Proxy Information'].notna().sum()\n",
+    "    proxy_missing = df['Proxy Information'].isna().sum()\n",
+    "    unique_proxies = df['Proxy Information'].nunique()\n",
+    "    \n",
+    "    print(f\"\\nProxy Information Statistics:\")\n",
+    "    print(f\"  - Total records: {total_records:,}\")\n",
+    "    print(f\"  - Records WITH proxy info: {proxy_present:,} ({proxy_present/total_records*100:.2f}%)\")\n",
+    "    print(f\"  - Records WITHOUT proxy info: {proxy_missing:,} ({proxy_missing/total_records*100:.2f}%)\")\n",
+    "    print(f\"  - Unique proxy values: {unique_proxies:,}\")\n",
+    "    \n",
+    "    # Create binary feature: has_proxy\n",
+    "    df['has_proxy'] = df['Proxy Information'].notna().astype(int)\n",
+    "    \n",
+    "    print(f\"\\nProxy Usage Distribution:\")\n",
+    "    print(df['has_proxy'].value_counts())\n",
+    "else:\n",
+    "    print(\"Warning: 'Proxy Information' column not found!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize proxy usage patterns\n",
+    "if 'has_proxy' in df.columns:\n",
+    "    fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
+    "    \n",
+    "    # 1. Overall proxy usage pie chart\n",
+    "    # value_counts() orders by frequency, not by value, so pin the order to\n",
+    "    # [0, 1]; otherwise the fixed labels/colors below are swapped whenever rows\n",
+    "    # with proxy info outnumber rows without it. fill_value=0 keeps both wedges\n",
+    "    # defined even if one class is absent.\n",
+    "    proxy_counts = df['has_proxy'].value_counts().reindex([0, 1], fill_value=0)\n",
+    "    labels = ['No Proxy', 'With Proxy']\n",
+    "    colors = ['lightcoral', 'lightgreen']\n",
+    "    axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%', \n",
+    "                    colors=colors, startangle=90)\n",
+    "    axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold')\n",
+    "    \n",
+    "    # 2. Proxy usage by Attack Type\n",
+    "    if 'Attack Type' in df.columns:\n",
+    "        proxy_attack = pd.crosstab(df['Attack Type'], df['has_proxy'], normalize='index') * 100\n",
+    "        proxy_attack.plot(kind='bar', ax=axes[0, 1], stacked=False, \n",
+    "                         color=['lightcoral', 'lightgreen'])\n",
+    "        axes[0, 1].set_title('Proxy Usage by Attack Type (%)', fontsize=14, fontweight='bold')\n",
+    "        axes[0, 1].set_xlabel('Attack Type')\n",
+    "        axes[0, 1].set_ylabel('Percentage')\n",
+    "        axes[0, 1].legend(['No Proxy', 'With Proxy'])\n",
+    "        axes[0, 1].tick_params(axis='x', rotation=45)\n",
+    "        \n",
+    "        # Print statistical summary\n",
+    "        print(\"\\nProxy Usage by Attack Type:\")\n",
+    "        print(proxy_attack)\n",
+    "    \n",
+    "    # 3. Proxy usage by Severity Level\n",
+    "    if 'Severity Level' in df.columns:\n",
+    "        proxy_severity = pd.crosstab(df['Severity Level'], df['has_proxy'])\n",
+    "        proxy_severity.plot(kind='bar', ax=axes[1, 0], color=['lightcoral', 'lightgreen'])\n",
+    "        axes[1, 0].set_title('Proxy Usage by Severity Level', fontsize=14, fontweight='bold')\n",
+    "        axes[1, 0].set_xlabel('Severity Level')\n",
+    "        axes[1, 0].set_ylabel('Count')\n",
+    "        axes[1, 0].legend(['No Proxy', 'With Proxy'])\n",
+    "        axes[1, 0].tick_params(axis='x', rotation=45)\n",
+    "    \n",
+    "    # 4. Proxy usage over time\n",
+    "    if 'Timestamp' in df.columns:\n",
+    "        df_temp = df.copy()\n",
+    "        df_temp['Timestamp'] = pd.to_datetime(df_temp['Timestamp'], errors='coerce')\n",
+    "        df_temp = df_temp.dropna(subset=['Timestamp'])\n",
+    "        df_temp['Date'] = df_temp['Timestamp'].dt.date\n",
+    "        \n",
+    "        proxy_time = df_temp.groupby('Date')['has_proxy'].agg(['sum', 'count'])\n",
+    "        proxy_time['percentage'] = (proxy_time['sum'] / proxy_time['count']) * 100\n",
+    "        \n",
+    "        axes[1, 1].plot(proxy_time.index, proxy_time['percentage'], \n",
+    "                       marker='o', color='steelblue', linewidth=2)\n",
+    "        axes[1, 1].set_title('Proxy Usage Trend Over Time', fontsize=14, fontweight='bold')\n",
+    "        axes[1, 1].set_xlabel('Date')\n",
+    "        axes[1, 1].set_ylabel('Percentage Using Proxy')\n",
+    "        axes[1, 1].tick_params(axis='x', rotation=45)\n",
+    "        axes[1, 1].grid(True, alpha=0.3)\n",
+    "    \n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze relationship with Log Source (Firewall vs Server)\n",
+    "if 'Log Source' in df.columns and 'has_proxy' in df.columns:\n",
+    "    print(\"\\nProxy Usage by Log Source:\")\n",
+    "    log_proxy = pd.crosstab(df['Log Source'], df['has_proxy'], normalize='index') * 100\n",
+    "    print(log_proxy)\n",
+    "    \n",
+    "    # Visualize\n",
+    "    log_proxy.plot(kind='bar', figsize=(10, 6), color=['lightcoral', 'lightgreen'])\n",
+    "    plt.title('Proxy Usage: Firewall vs Server Logs', fontsize=14, fontweight='bold')\n",
+    "    plt.xlabel('Log Source')\n",
+    "    plt.ylabel('Percentage')\n",
+    "    plt.legend(['No Proxy', 'With Proxy'])\n",
+    "    plt.xticks(rotation=0)\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Descriptive breakdown of proxy usage per attack type (counts and percentages).\n",
+    "# Note: no hypothesis test is performed in this cell.\n",
+    "if 'Attack Type' in df.columns and 'has_proxy' in df.columns:\n",
+    "    print(\"\\n\" + \"=\"*80)\n",
+    "    print(\"PROXY USAGE INSIGHTS BY ATTACK TYPE\")\n",
+    "    print(\"=\"*80)\n",
+    "    \n",
+    "    # dropna(): unique() also yields NaN, and `== NaN` matches no rows, which\n",
+    "    # would print an empty group with a nan percentage.\n",
+    "    for attack_type in df['Attack Type'].dropna().unique():\n",
+    "        subset = df[df['Attack Type'] == attack_type]\n",
+    "        n_total = len(subset)\n",
+    "        n_with_proxy = subset['has_proxy'].sum()\n",
+    "        proxy_pct = (n_with_proxy / n_total) * 100\n",
+    "        \n",
+    "        print(f\"\\n{attack_type}:\")\n",
+    "        print(f\"  - Total attacks: {n_total:,}\")\n",
+    "        print(f\"  - With proxy: {n_with_proxy:,} ({proxy_pct:.2f}%)\")\n",
+    "        print(f\"  - Without proxy: {n_total - n_with_proxy:,} ({100-proxy_pct:.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 📊 Key Findings - Proxy Analysis\n",
+    "\n",
+    "**Summary:**\n",
+    "- Write your key findings here after running the cells above\n",
+    "- Which attack types use proxies most?\n",
+    "- Is there a correlation with severity?\n",
+    "- Any temporal patterns?\n",
+    "\n",
+    "**Recommendation for ML Model:**\n",
+    "- The binary feature `has_proxy` appears to be a strong discriminator\n",
+    "- Consider as a key feature in your model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. PART 2: IP Trends and Spoofing Detection\n",
+    "\n",
+    "**Analysis Goals:**\n",
+    "1. Identify top source and destination IPs\n",
+    "2. Detect fan-out patterns (one source → many destinations = scanning/spoofing)\n",
+    "3. Detect fan-in patterns (many sources → one destination = DDoS)\n",
+    "4. Analyze bidirectional traffic\n",
+    "5. Detect private IP usage anomalies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "separator = \"=\" * 80\n",
+    "print(separator)\n",
+    "print(\"IP TRENDS AND SPOOFING DETECTION\")\n",
+    "print(separator)\n",
+    "\n",
+    "# Basic IP statistics (only when both address columns are present)\n",
+    "if {'Source IP Address', 'Destination IP Address'}.issubset(df.columns):\n",
+    "    n_connections = len(df)\n",
+    "    n_unique_src = df['Source IP Address'].nunique()\n",
+    "    n_unique_dst = df['Destination IP Address'].nunique()\n",
+    "\n",
+    "    print(f\"\\nIP Address Statistics:\")\n",
+    "    print(f\"  - Unique Source IPs: {n_unique_src:,}\")\n",
+    "    print(f\"  - Unique Destination IPs: {n_unique_dst:,}\")\n",
+    "    print(f\"  - Total IP-to-IP connections: {n_connections:,}\")\n",
+    "    print(f\"  - Average connections per source IP: {n_connections/n_unique_src:.2f}\")\n",
+    "    print(f\"  - Average connections per destination IP: {n_connections/n_unique_dst:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Top Source IPs\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"TOP SOURCE IP ADDRESSES\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "top_src_ips = df['Source IP Address'].value_counts().head(20)\n",
+    "print(\"\\nTop 20 Source IPs:\")\n",
+    "print(top_src_ips)\n",
+    "\n",
+    "# Visualize\n",
+    "plt.figure(figsize=(12, 8))\n",
+    "plt.barh(range(len(top_src_ips)), top_src_ips.values, color='steelblue')\n",
+    "plt.yticks(range(len(top_src_ips)), top_src_ips.index)\n",
+    "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n",
+    "plt.ylabel('Source IP Address', fontsize=12)\n",
+    "plt.title('Top 20 Most Active Source IP Addresses', fontsize=14, fontweight='bold')\n",
+    "plt.gca().invert_yaxis()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Top Destination IPs\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"TOP DESTINATION IP ADDRESSES\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "top_dst_ips = df['Destination IP Address'].value_counts().head(20)\n",
+    "print(\"\\nTop 20 Destination IPs:\")\n",
+    "print(top_dst_ips)\n",
+    "\n",
+    "# Visualize\n",
+    "plt.figure(figsize=(12, 8))\n",
+    "plt.barh(range(len(top_dst_ips)), top_dst_ips.values, color='coral')\n",
+    "plt.yticks(range(len(top_dst_ips)), top_dst_ips.index)\n",
+    "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n",
+    "plt.ylabel('Destination IP Address', fontsize=12)\n",
+    "plt.title('Top 20 Most Targeted Destination IP Addresses', fontsize=14, fontweight='bold')\n",
+    "plt.gca().invert_yaxis()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# SPOOFING DETECTION 1: Fan-out Analysis (Source IP → Multiple Destinations)\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"SPOOFING INDICATOR 1: FAN-OUT PATTERN (Source → Multiple Destinations)\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "# Count unique destinations per source IP\n",
+    "src_to_dst_mapping = df.groupby('Source IP Address')['Destination IP Address'].nunique()\n",
+    "src_to_dst_mapping = src_to_dst_mapping.sort_values(ascending=False)\n",
+    "\n",
+    "# Calculate thresholds\n",
+    "threshold_95 = src_to_dst_mapping.quantile(0.95)\n",
+    "threshold_99 = src_to_dst_mapping.quantile(0.99)\n",
+    "\n",
+    "suspicious_sources_95 = src_to_dst_mapping[src_to_dst_mapping > threshold_95]\n",
+    "suspicious_sources_99 = src_to_dst_mapping[src_to_dst_mapping > threshold_99]\n",
+    "\n",
+    "print(f\"\\nFan-out Statistics:\")\n",
+    "print(f\"  - Mean destinations per source: {src_to_dst_mapping.mean():.2f}\")\n",
+    "print(f\"  - Median destinations per source: {src_to_dst_mapping.median():.2f}\")\n",
+    "print(f\"  - 95th percentile threshold: {threshold_95:.0f} destinations\")\n",
+    "print(f\"  - 99th percentile threshold: {threshold_99:.0f} destinations\")\n",
+    "print(f\"\\nSuspicious Source IPs:\")\n",
+    "print(f\"  - IPs above 95th percentile: {len(suspicious_sources_95)} ({len(suspicious_sources_95)/len(src_to_dst_mapping)*100:.2f}%)\")\n",
+    "print(f\"  - IPs above 99th percentile: {len(suspicious_sources_99)} ({len(suspicious_sources_99)/len(src_to_dst_mapping)*100:.2f}%)\")\n",
+    "\n",
+    "print(f\"\\nTop 10 Source IPs with Highest Fan-out:\")\n",
+    "print(src_to_dst_mapping.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize fan-out distribution\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "\n",
+    "# Histogram\n",
+    "axes[0].hist(src_to_dst_mapping.values, bins=50, color='red', alpha=0.7, edgecolor='black')\n",
+    "axes[0].axvline(threshold_95, color='darkred', linestyle='--', linewidth=2, \n",
+    "                label=f'95th percentile: {threshold_95:.0f}')\n",
+    "axes[0].axvline(threshold_99, color='maroon', linestyle='--', linewidth=2, \n",
+    "                label=f'99th percentile: {threshold_99:.0f}')\n",
+    "axes[0].set_xlabel('Number of Unique Destinations per Source IP', fontsize=12)\n",
+    "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n",
+    "axes[0].set_title('Source IP Fan-out Distribution\\n(Potential Scanning/Spoofing)', \n",
+    "                  fontsize=14, fontweight='bold')\n",
+    "axes[0].set_yscale('log')\n",
+    "axes[0].legend()\n",
+    "axes[0].grid(True, alpha=0.3)\n",
+    "\n",
+    "# Top suspicious IPs\n",
+    "top_suspicious = src_to_dst_mapping.head(15)\n",
+    "axes[1].barh(range(len(top_suspicious)), top_suspicious.values, color='darkred')\n",
+    "axes[1].set_yticks(range(len(top_suspicious)))\n",
+    "axes[1].set_yticklabels(top_suspicious.index)\n",
+    "axes[1].set_xlabel('Number of Unique Destinations', fontsize=12)\n",
+    "axes[1].set_ylabel('Source IP Address', fontsize=12)\n",
+    "axes[1].set_title('Top 15 Source IPs by Fan-out\\n(Most Suspicious)', \n",
+    "                  fontsize=14, fontweight='bold')\n",
+    "axes[1].invert_yaxis()\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# SPOOFING DETECTION 2: Fan-in Analysis (Multiple Sources → Single Destination)\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"SPOOFING INDICATOR 2: FAN-IN PATTERN (Multiple Sources → Destination)\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "# Count unique sources per destination IP\n",
+    "dst_to_src_mapping = df.groupby('Destination IP Address')['Source IP Address'].nunique()\n",
+    "dst_to_src_mapping = dst_to_src_mapping.sort_values(ascending=False)\n",
+    "\n",
+    "# Calculate thresholds\n",
+    "threshold_95_dst = dst_to_src_mapping.quantile(0.95)\n",
+    "threshold_99_dst = dst_to_src_mapping.quantile(0.99)\n",
+    "\n",
+    "suspicious_targets_95 = dst_to_src_mapping[dst_to_src_mapping > threshold_95_dst]\n",
+    "suspicious_targets_99 = dst_to_src_mapping[dst_to_src_mapping > threshold_99_dst]\n",
+    "\n",
+    "print(f\"\\nFan-in Statistics:\")\n",
+    "print(f\"  - Mean sources per destination: {dst_to_src_mapping.mean():.2f}\")\n",
+    "print(f\"  - Median sources per destination: {dst_to_src_mapping.median():.2f}\")\n",
+    "print(f\"  - 95th percentile threshold: {threshold_95_dst:.0f} sources\")\n",
+    "print(f\"  - 99th percentile threshold: {threshold_99_dst:.0f} sources\")\n",
+    "print(f\"\\nSuspicious Target IPs (Potential DDoS Victims):\")\n",
+    "print(f\"  - IPs above 95th percentile: {len(suspicious_targets_95)} ({len(suspicious_targets_95)/len(dst_to_src_mapping)*100:.2f}%)\")\n",
+    "print(f\"  - IPs above 99th percentile: {len(suspicious_targets_99)} ({len(suspicious_targets_99)/len(dst_to_src_mapping)*100:.2f}%)\")\n",
+    "\n",
+    "print(f\"\\nTop 10 Target IPs with Highest Fan-in:\")\n",
+    "print(dst_to_src_mapping.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize fan-in distribution\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "\n",
+    "# Histogram\n",
+    "axes[0].hist(dst_to_src_mapping.values, bins=50, color='purple', alpha=0.7, edgecolor='black')\n",
+    "axes[0].axvline(threshold_95_dst, color='darkviolet', linestyle='--', linewidth=2, \n",
+    "                label=f'95th percentile: {threshold_95_dst:.0f}')\n",
+    "axes[0].axvline(threshold_99_dst, color='indigo', linestyle='--', linewidth=2, \n",
+    "                label=f'99th percentile: {threshold_99_dst:.0f}')\n",
+    "axes[0].set_xlabel('Number of Unique Sources per Destination IP', fontsize=12)\n",
+    "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n",
+    "axes[0].set_title('Destination IP Fan-in Distribution\\n(Potential DDoS Targets)', \n",
+    "                  fontsize=14, fontweight='bold')\n",
+    "axes[0].set_yscale('log')\n",
+    "axes[0].legend()\n",
+    "axes[0].grid(True, alpha=0.3)\n",
+    "\n",
+    "# Top targeted IPs\n",
+    "top_targets = dst_to_src_mapping.head(15)\n",
+    "axes[1].barh(range(len(top_targets)), top_targets.values, color='darkviolet')\n",
+    "axes[1].set_yticks(range(len(top_targets)))\n",
+    "axes[1].set_yticklabels(top_targets.index)\n",
+    "axes[1].set_xlabel('Number of Unique Sources', fontsize=12)\n",
+    "axes[1].set_ylabel('Destination IP Address', fontsize=12)\n",
+    "axes[1].set_title('Top 15 Destination IPs by Fan-in\\n(Potential DDoS Targets)', \n",
+    "                  fontsize=14, fontweight='bold')\n",
+    "axes[1].invert_yaxis()\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# SPOOFING DETECTION 3: Bidirectional Traffic Analysis\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"SPOOFING INDICATOR 3: BIDIRECTIONAL TRAFFIC\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "source_ips_set = set(df['Source IP Address'].dropna())\n",
+    "dest_ips_set = set(df['Destination IP Address'].dropna())\n",
+    "bidirectional_ips = source_ips_set.intersection(dest_ips_set)\n",
+    "\n",
+    "print(f\"\\nBidirectional IP Statistics:\")\n",
+    "print(f\"  - Total unique source IPs: {len(source_ips_set):,}\")\n",
+    "print(f\"  - Total unique destination IPs: {len(dest_ips_set):,}\")\n",
+    "print(f\"  - IPs appearing as BOTH source and destination: {len(bidirectional_ips):,}\")\n",
+    "print(f\"  - Percentage of bidirectional IPs: {len(bidirectional_ips)/(len(source_ips_set.union(dest_ips_set)))*100:.2f}%\")\n",
+    "\n",
+    "# Analyze bidirectional traffic by attack type\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    df['is_bidirectional'] = (df['Source IP Address'].isin(bidirectional_ips)) | \\\n",
+    "                              (df['Destination IP Address'].isin(bidirectional_ips))\n",
+    "    \n",
+    "    print(f\"\\nBidirectional Traffic by Attack Type:\")\n",
+    "    bidir_attack = pd.crosstab(df['Attack Type'], df['is_bidirectional'], normalize='index') * 100\n",
+    "    print(bidir_attack)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize bidirectional traffic\n",
+    "if 'Attack Type' in df.columns and 'is_bidirectional' in df.columns:\n",
+    "    # Build the percentage table here and force both boolean columns to exist:\n",
+    "    # if no row involves a bidirectional IP the crosstab has no True column and\n",
+    "    # selecting it would raise KeyError.\n",
+    "    bidir_pct = pd.crosstab(df['Attack Type'], df['is_bidirectional'], normalize='index') * 100\n",
+    "    bidir_pct = bidir_pct.reindex(columns=[False, True], fill_value=0.0)\n",
+    "    \n",
+    "    plt.figure(figsize=(12, 6))\n",
+    "    bidir_pct[True].sort_values().plot(kind='barh', color='teal', edgecolor='black')\n",
+    "    plt.xlabel('Percentage of Traffic with Bidirectional IPs', fontsize=12)\n",
+    "    plt.ylabel('Attack Type', fontsize=12)\n",
+    "    plt.title('Bidirectional IP Traffic by Attack Type', fontsize=14, fontweight='bold')\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# SPOOFING DETECTION 4: Private IP Detection\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"SPOOFING INDICATOR 4: PRIVATE IP ADDRESS DETECTION\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "def is_private_ip(ip):\n",
+    "    \"\"\"Return True if `ip` is a well-formed IPv4 address in an RFC 1918 private range.\n",
+    "\n",
+    "    Private ranges are 10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16.\n",
+    "    Missing values, strings that are not four dot-separated integers, and\n",
+    "    addresses with any octet outside 0-255 all return False.\n",
+    "    \"\"\"\n",
+    "    if pd.isna(ip):\n",
+    "        return False\n",
+    "    parts = str(ip).split('.')\n",
+    "    if len(parts) != 4:\n",
+    "        return False\n",
+    "    try:\n",
+    "        octets = [int(part) for part in parts]\n",
+    "    except ValueError:\n",
+    "        # Non-numeric octet: not an IPv4 address\n",
+    "        return False\n",
+    "    # Reject malformed addresses such as 10.0.0.999 instead of classifying them\n",
+    "    if any(octet < 0 or octet > 255 for octet in octets):\n",
+    "        return False\n",
+    "    \n",
+    "    first, second = octets[0], octets[1]\n",
+    "    # Private IP ranges: 10.x.x.x, 172.16-31.x.x, 192.168.x.x\n",
+    "    if first == 10:\n",
+    "        return True\n",
+    "    if first == 172 and 16 <= second <= 31:\n",
+    "        return True\n",
+    "    return first == 192 and second == 168\n",
+    "\n",
+    "df['src_is_private'] = df['Source IP Address'].apply(is_private_ip)\n",
+    "df['dst_is_private'] = df['Destination IP Address'].apply(is_private_ip)\n",
+    "\n",
+    "print(f\"\\nPrivate IP Statistics:\")\n",
+    "print(f\"  - Source IPs from private ranges: {df['src_is_private'].sum():,} ({df['src_is_private'].sum()/len(df)*100:.2f}%)\")\n",
+    "print(f\"  - Destination IPs from private ranges: {df['dst_is_private'].sum():,} ({df['dst_is_private'].sum()/len(df)*100:.2f}%)\")\n",
+    "print(f\"  - Total connections involving private IPs: {(df['src_is_private'] | df['dst_is_private']).sum():,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze private IP usage by attack type\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    print(\"\\nPrivate IP Usage by Attack Type:\")\n",
+    "    attack_private = df.groupby('Attack Type').agg({\n",
+    "        'src_is_private': ['sum', 'mean'],\n",
+    "        'dst_is_private': ['sum', 'mean']\n",
+    "    })\n",
+    "    attack_private.columns = ['Src_Private_Count', 'Src_Private_Pct', 'Dst_Private_Count', 'Dst_Private_Pct']\n",
+    "    attack_private['Src_Private_Pct'] = attack_private['Src_Private_Pct'] * 100\n",
+    "    attack_private['Dst_Private_Pct'] = attack_private['Dst_Private_Pct'] * 100\n",
+    "    print(attack_private)\n",
+    "    \n",
+    "    # Visualize\n",
+    "    fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "    \n",
+    "    attack_private['Src_Private_Pct'].plot(kind='bar', ax=axes[0], color='orange', edgecolor='black')\n",
+    "    axes[0].set_title('Source Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n",
+    "    axes[0].set_xlabel('Attack Type')\n",
+    "    axes[0].set_ylabel('Percentage')\n",
+    "    axes[0].tick_params(axis='x', rotation=45)\n",
+    "    \n",
+    "    attack_private['Dst_Private_Pct'].plot(kind='bar', ax=axes[1], color='red', edgecolor='black')\n",
+    "    axes[1].set_title('Destination Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n",
+    "    axes[1].set_xlabel('Attack Type')\n",
+    "    axes[1].set_ylabel('Percentage')\n",
+    "    axes[1].tick_params(axis='x', rotation=45)\n",
+    "    \n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Geo-location analysis (if available)\n",
+    "if 'Geo-location Data' in df.columns:\n",
+    "    print(\"\\n\" + \"=\"*80)\n",
+    "    print(\"GEO-LOCATION ANALYSIS\")\n",
+    "    print(\"=\"*80)\n",
+    "    \n",
+    "    print(f\"\\nGeo-location Statistics:\")\n",
+    "    print(f\"  - Unique locations: {df['Geo-location Data'].nunique():,}\")\n",
+    "    print(f\"  - Missing values: {df['Geo-location Data'].isna().sum():,}\")\n",
+    "    \n",
+    "    # Top locations\n",
+    "    print(f\"\\nTop 15 Geo-locations:\")\n",
+    "    top_locations = df['Geo-location Data'].value_counts().head(15)\n",
+    "    print(top_locations)\n",
+    "    \n",
+    "    # Visualize\n",
+    "    plt.figure(figsize=(14, 8))\n",
+    "    top_locations.plot(kind='barh', color='skyblue', edgecolor='black')\n",
+    "    plt.xlabel('Frequency', fontsize=12)\n",
+    "    plt.ylabel('Geo-location', fontsize=12)\n",
+    "    plt.title('Top 15 Geo-locations in Attack Traffic', fontsize=14, fontweight='bold')\n",
+    "    plt.gca().invert_yaxis()\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()\n",
+    "    \n",
+    "    # Geo-location by attack type\n",
+    "    if 'Attack Type' in df.columns:\n",
+    "        print(f\"\\nTop Geo-location by Attack Type:\")\n",
+    "        for attack in df['Attack Type'].unique():\n",
+    "            top_loc = df[df['Attack Type'] == attack]['Geo-location Data'].value_counts().head(1)\n",
+    "            if len(top_loc) > 0:\n",
+    "                print(f\"  {attack}: {top_loc.index[0]} ({top_loc.values[0]} occurrences)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 📊 Key Findings - IP Trends & Spoofing\n",
+    "\n",
+    "**Summary:**\n",
+    "- Write your key findings here\n",
+    "- How many suspicious IPs detected (fan-out/fan-in)?\n",
+    "- Any DDoS targets identified?\n",
+    "- Private IP issues?\n",
+    "- Geographic patterns?\n",
+    "\n",
+    "**Red Flags Identified:**\n",
+    "- List specific suspicious IPs or patterns\n",
+    "\n",
+    "**Recommendation for ML Model:**\n",
+    "- Create features: source_fanout_score, dest_fanin_score, is_bidirectional, src_is_private, dst_is_private"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. PART 3: Data Bin Trends Analysis\n",
+    "\n",
+    "**Analysis Goals:**\n",
+    "1. Packet Length distribution and binning\n",
+    "2. Port usage patterns (well-known, registered, dynamic)\n",
+    "3. Protocol distribution\n",
+    "4. Anomaly score categorization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=\"*80)\n",
+    "print(\"DATA BIN TRENDS ANALYSIS\")\n",
+    "print(\"=\"*80)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. PACKET LENGTH ANALYSIS\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"PACKET LENGTH DISTRIBUTION\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "if 'Packet Length' in df.columns:\n",
+    "    packet_lengths = df['Packet Length'].dropna()\n",
+    "    \n",
+    "    print(f\"\\nPacket Length Statistics:\")\n",
+    "    print(f\"  - Mean: {packet_lengths.mean():.2f} bytes\")\n",
+    "    print(f\"  - Median: {packet_lengths.median():.2f} bytes\")\n",
+    "    print(f\"  - Std Dev: {packet_lengths.std():.2f} bytes\")\n",
+    "    print(f\"  - Min: {packet_lengths.min():.2f} bytes\")\n",
+    "    print(f\"  - Max: {packet_lengths.max():.2f} bytes\")\n",
+    "    print(f\"  - 25th percentile: {packet_lengths.quantile(0.25):.2f} bytes\")\n",
+    "    print(f\"  - 75th percentile: {packet_lengths.quantile(0.75):.2f} bytes\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create packet length bins\n",
+    "# Bins are right-inclusive: (0, 64], (64, 128], ... include_lowest=True also\n",
+    "# keeps a length of exactly 0 in the first bin instead of turning it into NaN\n",
+    "# and silently dropping it from the distribution below.\n",
+    "bins_packet = [0, 64, 128, 256, 512, 1024, 2048, float('inf')]\n",
+    "labels_packet = ['0-64', '64-128', '128-256', '256-512', '512-1024', '1024-2048', '2048+']\n",
+    "df['packet_length_bin'] = pd.cut(df['Packet Length'], bins=bins_packet, labels=labels_packet,\n",
+    "                                 include_lowest=True)\n",
+    "\n",
+    "packet_bin_dist = df['packet_length_bin'].value_counts().sort_index()\n",
+    "print(f\"\\nPacket Length Bins Distribution:\")\n",
+    "print(packet_bin_dist)\n",
+    "print(f\"\\nPercentage Distribution:\")\n",
+    "print((packet_bin_dist / packet_bin_dist.sum() * 100).round(2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize packet length\n",
+    "# 2x2 grid: raw histogram, binned counts, and two per-attack-type views.\n",
+    "# Uses packet_lengths and packet_bin_dist computed by the preceding cells.\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
+    "(hist_ax, bins_ax), (box_ax, stacked_ax) = axes\n",
+    "\n",
+    "# Histogram (y-axis is log-scaled, as the axis label states)\n",
+    "hist_ax.hist(packet_lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7)\n",
+    "hist_ax.set_xlabel('Packet Length (bytes)', fontsize=11)\n",
+    "hist_ax.set_ylabel('Frequency (log scale)', fontsize=11)\n",
+    "hist_ax.set_title('Packet Length Distribution', fontsize=13, fontweight='bold')\n",
+    "hist_ax.set_yscale('log')\n",
+    "hist_ax.grid(True, alpha=0.3)\n",
+    "\n",
+    "# Binned distribution\n",
+    "packet_bin_dist.plot(kind='bar', ax=bins_ax, color='coral', edgecolor='black')\n",
+    "bins_ax.set_xlabel('Packet Length Bins (bytes)', fontsize=11)\n",
+    "bins_ax.set_ylabel('Count', fontsize=11)\n",
+    "bins_ax.set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold')\n",
+    "bins_ax.tick_params(axis='x', rotation=45)\n",
+    "\n",
+    "# Bottom row needs the label column; both panels are skipped without it.\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    # Box plot by attack type\n",
+    "    df.boxplot(column='Packet Length', by='Attack Type', ax=box_ax)\n",
+    "    box_ax.set_xlabel('Attack Type', fontsize=11)\n",
+    "    box_ax.set_ylabel('Packet Length (bytes)', fontsize=11)\n",
+    "    box_ax.set_title('Packet Length by Attack Type', fontsize=13, fontweight='bold')\n",
+    "    box_ax.get_figure().suptitle('')  # Remove default title\n",
+    "    # Make this Axes current so plt.xticks applies to it.\n",
+    "    plt.sca(box_ax)\n",
+    "    plt.xticks(rotation=45, ha='right')\n",
+    "\n",
+    "    # Bins by attack type (stacked bar); normalize='index' makes each row sum to 100%.\n",
+    "    bin_attack = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100\n",
+    "    bin_attack.plot(kind='bar', stacked=True, ax=stacked_ax, colormap='tab10')\n",
+    "    stacked_ax.set_xlabel('Attack Type', fontsize=11)\n",
+    "    stacked_ax.set_ylabel('Percentage', fontsize=11)\n",
+    "    stacked_ax.set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold')\n",
+    "    stacked_ax.legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "    stacked_ax.tick_params(axis='x', rotation=45)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. PORT ANALYSIS\n",
+    "# Lists the ten most frequent source and destination ports.\n",
+    "section_rule = \"=\" * 80\n",
+    "print(\"\\n\" + section_rule)\n",
+    "print(\"PORT USAGE ANALYSIS\")\n",
+    "print(section_rule)\n",
+    "\n",
+    "# Both port columns must be present for this section to run.\n",
+    "if {'Source Port', 'Destination Port'} <= set(df.columns):\n",
+    "    # Source ports\n",
+    "    print(\"\\nTop 10 Source Ports:\")\n",
+    "    top_src_ports = df['Source Port'].value_counts().head(10)\n",
+    "    print(top_src_ports)\n",
+    "\n",
+    "    # Destination ports\n",
+    "    print(\"\\nTop 10 Destination Ports:\")\n",
+    "    top_dst_ports = df['Destination Port'].value_counts().head(10)\n",
+    "    print(top_dst_ports)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create port categories\n",
+    "def categorize_port(port):\n",
+    "    \"\"\"Categorize a port as well-known, registered, or dynamic.\n",
+    "\n",
+    "    Args:\n",
+    "        port: Port number (int, float, or numeric string). May be missing.\n",
+    "\n",
+    "    Returns:\n",
+    "        str: 'Well-known (0-1023)', 'Registered (1024-49151)' or\n",
+    "        'Dynamic (49152-65535)'; 'Unknown' when the value is missing,\n",
+    "        cannot be converted to an integer, or lies outside 0-65535.\n",
+    "    \"\"\"\n",
+    "    if pd.isna(port):\n",
+    "        return 'Unknown'\n",
+    "    try:\n",
+    "        port = int(port)\n",
+    "    except (TypeError, ValueError, OverflowError):\n",
+    "        # Only conversion failures mean \"not a port\". A bare except would\n",
+    "        # also swallow KeyboardInterrupt and hide unrelated bugs.\n",
+    "        return 'Unknown'\n",
+    "\n",
+    "    if 0 <= port <= 1023:\n",
+    "        return 'Well-known (0-1023)'\n",
+    "    if 1024 <= port <= 49151:\n",
+    "        return 'Registered (1024-49151)'\n",
+    "    if 49152 <= port <= 65535:\n",
+    "        return 'Dynamic (49152-65535)'\n",
+    "    # Negative or above 65535: not a valid port number.\n",
+    "    return 'Unknown'\n",
+    "\n",
+    "# Tag each port column with its range category.\n",
+    "for port_col, category_col in (('Destination Port', 'dst_port_category'),\n",
+    "                               ('Source Port', 'src_port_category')):\n",
+    "    df[category_col] = df[port_col].apply(categorize_port)\n",
+    "\n",
+    "# Only the destination side is reported here.\n",
+    "dst_category_counts = df['dst_port_category'].value_counts()\n",
+    "print(\"\\nDestination Port Categories:\")\n",
+    "print(dst_category_counts)\n",
+    "print(\"\\nPercentage:\")\n",
+    "print((dst_category_counts / len(df) * 100).round(2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize port analysis\n",
+    "# 2x2 grid: top source ports, top destination ports, destination port\n",
+    "# category shares, and port categories split by attack type.\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
+    "\n",
+    "top_src_ports_15 = df['Source Port'].value_counts().head(15)\n",
+    "top_dst_ports_15 = df['Destination Port'].value_counts().head(15)\n",
+    "\n",
+    "# Top row: one horizontal bar chart per port column.\n",
+    "top_port_panels = [\n",
+    "    (axes[0, 0], top_src_ports_15, 'lightgreen', 'Top 15 Source Ports'),\n",
+    "    (axes[0, 1], top_dst_ports_15, 'lightcoral', 'Top 15 Destination Ports'),\n",
+    "]\n",
+    "for panel_ax, port_counts, bar_color, panel_title in top_port_panels:\n",
+    "    bar_positions = range(len(port_counts))\n",
+    "    panel_ax.barh(bar_positions, port_counts.values, color=bar_color)\n",
+    "    panel_ax.set_yticks(bar_positions)\n",
+    "    panel_ax.set_yticklabels(port_counts.index)\n",
+    "    panel_ax.set_xlabel('Frequency', fontsize=11)\n",
+    "    panel_ax.set_ylabel('Port Number', fontsize=11)\n",
+    "    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')\n",
+    "    # Flip the axis so the most frequent port is drawn at the top.\n",
+    "    panel_ax.invert_yaxis()\n",
+    "\n",
+    "# Port category pie chart\n",
+    "category_ax = axes[1, 0]\n",
+    "port_cat_dist = df['dst_port_category'].value_counts()\n",
+    "category_ax.pie(port_cat_dist.values, labels=port_cat_dist.index, autopct='%1.1f%%', startangle=90)\n",
+    "category_ax.set_title('Destination Port Categories', fontsize=13, fontweight='bold')\n",
+    "\n",
+    "# Port categories by attack type (raw counts, not percentages)\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    by_attack_ax = axes[1, 1]\n",
+    "    port_attack = pd.crosstab(df['Attack Type'], df['dst_port_category'])\n",
+    "    port_attack.plot(kind='bar', stacked=True, ax=by_attack_ax, colormap='Set3')\n",
+    "    by_attack_ax.set_xlabel('Attack Type', fontsize=11)\n",
+    "    by_attack_ax.set_ylabel('Count', fontsize=11)\n",
+    "    by_attack_ax.set_title('Port Categories by Attack Type', fontsize=13, fontweight='bold')\n",
+    "    by_attack_ax.legend(title='Port Category', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "    by_attack_ax.tick_params(axis='x', rotation=45)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. PROTOCOL ANALYSIS\n",
+    "# Counts and percentage share of each protocol value.\n",
+    "section_rule = \"=\" * 80\n",
+    "print(\"\\n\" + section_rule)\n",
+    "print(\"PROTOCOL DISTRIBUTION\")\n",
+    "print(section_rule)\n",
+    "\n",
+    "if 'Protocol' in df.columns:\n",
+    "    protocol_dist = df['Protocol'].value_counts()\n",
+    "    protocol_share_pct = (protocol_dist / protocol_dist.sum() * 100).round(2)\n",
+    "\n",
+    "    print(\"\\nProtocol Distribution:\")\n",
+    "    print(protocol_dist)\n",
+    "    print(\"\\nPercentage:\")\n",
+    "    print(protocol_share_pct)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize protocol analysis\n",
+    "# Left: overall protocol share. Right: protocol mix within each attack type.\n",
+    "# Uses protocol_dist computed by the preceding cell.\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "share_ax, by_attack_ax = axes\n",
+    "\n",
+    "# Protocol pie chart\n",
+    "share_ax.pie(protocol_dist.values, labels=protocol_dist.index, autopct='%1.1f%%', startangle=90)\n",
+    "share_ax.set_title('Protocol Distribution', fontsize=14, fontweight='bold')\n",
+    "\n",
+    "# Protocol by attack type\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    # normalize='index' makes each attack-type row sum to 100%.\n",
+    "    protocol_attack = pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100\n",
+    "    protocol_attack.plot(kind='bar', stacked=True, ax=by_attack_ax, colormap='viridis')\n",
+    "    by_attack_ax.set_xlabel('Attack Type', fontsize=12)\n",
+    "    by_attack_ax.set_ylabel('Percentage', fontsize=12)\n",
+    "    by_attack_ax.set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold')\n",
+    "    by_attack_ax.legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "    by_attack_ax.tick_params(axis='x', rotation=45)\n",
+    "\n",
+    "    print(\"\\nProtocol Usage by Attack Type (%):\")\n",
+    "    print(protocol_attack.round(2))\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. ANOMALY SCORES ANALYSIS\n",
+    "# Summary statistics for the anomaly scores; missing scores are dropped first.\n",
+    "section_rule = \"=\" * 80\n",
+    "print(\"\\n\" + section_rule)\n",
+    "print(\"ANOMALY SCORES DISTRIBUTION\")\n",
+    "print(section_rule)\n",
+    "\n",
+    "if 'Anomaly Scores' in df.columns:\n",
+    "    anomaly_scores = df['Anomaly Scores'].dropna()\n",
+    "\n",
+    "    # (label, value) pairs in display order.\n",
+    "    anomaly_score_stats = [\n",
+    "        ('Mean', anomaly_scores.mean()),\n",
+    "        ('Median', anomaly_scores.median()),\n",
+    "        ('Std Dev', anomaly_scores.std()),\n",
+    "        ('Min', anomaly_scores.min()),\n",
+    "        ('Max', anomaly_scores.max()),\n",
+    "        ('25th percentile', anomaly_scores.quantile(0.25)),\n",
+    "        ('50th percentile', anomaly_scores.quantile(0.50)),\n",
+    "        ('75th percentile', anomaly_scores.quantile(0.75)),\n",
+    "    ]\n",
+    "\n",
+    "    print(\"\\nAnomaly Score Statistics:\")\n",
+    "    for stat_label, stat_value in anomaly_score_stats:\n",
+    "        print(f\"  - {stat_label}: {stat_value:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create anomaly score categories based on quartiles\n",
+    "# Bin edges are the min, the three quartiles and the max of the non-null scores.\n",
+    "# NOTE: pd.cut raises ValueError if two edges coincide (e.g. heavily tied scores).\n",
+    "quartile_points = (0.25, 0.5, 0.75)\n",
+    "anomaly_bins = (\n",
+    "    [anomaly_scores.min()]\n",
+    "    + [anomaly_scores.quantile(point) for point in quartile_points]\n",
+    "    + [anomaly_scores.max()]\n",
+    ")\n",
+    "anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)']\n",
+    "\n",
+    "# include_lowest=True keeps the minimum score inside the first bin.\n",
+    "df['anomaly_category'] = pd.cut(\n",
+    "    df['Anomaly Scores'],\n",
+    "    bins=anomaly_bins,\n",
+    "    labels=anomaly_labels,\n",
+    "    include_lowest=True,\n",
+    ")\n",
+    "\n",
+    "anomaly_cat_dist = df['anomaly_category'].value_counts().sort_index()\n",
+    "print(\"\\nAnomaly Score Categories:\")\n",
+    "print(anomaly_cat_dist)\n",
+    "print(\"\\nPercentage:\")\n",
+    "print((anomaly_cat_dist / anomaly_cat_dist.sum() * 100).round(2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize anomaly scores\n",
+    "# 2x2 grid: score histogram, category counts, and two per-attack-type views.\n",
+    "# Uses anomaly_scores and anomaly_cat_dist computed by the preceding cells.\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
+    "(hist_ax, category_ax), (box_ax, stacked_ax) = axes\n",
+    "\n",
+    "# Histogram\n",
+    "hist_ax.hist(anomaly_scores, bins=50, color='purple', alpha=0.7, edgecolor='black')\n",
+    "hist_ax.set_xlabel('Anomaly Score', fontsize=11)\n",
+    "hist_ax.set_ylabel('Frequency', fontsize=11)\n",
+    "hist_ax.set_title('Anomaly Score Distribution', fontsize=13, fontweight='bold')\n",
+    "hist_ax.grid(True, alpha=0.3)\n",
+    "\n",
+    "# Category bar chart; one colour per category, in category order.\n",
+    "colors = ['green', 'yellow', 'orange', 'red']\n",
+    "category_positions = range(len(anomaly_cat_dist))\n",
+    "category_ax.bar(category_positions, anomaly_cat_dist.values,\n",
+    "                color=colors, edgecolor='black')\n",
+    "category_ax.set_xticks(category_positions)\n",
+    "category_ax.set_xticklabels(anomaly_cat_dist.index, rotation=45, ha='right')\n",
+    "category_ax.set_xlabel('Anomaly Category', fontsize=11)\n",
+    "category_ax.set_ylabel('Count', fontsize=11)\n",
+    "category_ax.set_title('Anomaly Score Categories', fontsize=13, fontweight='bold')\n",
+    "\n",
+    "# Bottom row needs the label column; both panels are skipped without it.\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    # Box plot by attack type\n",
+    "    df.boxplot(column='Anomaly Scores', by='Attack Type', ax=box_ax)\n",
+    "    box_ax.set_xlabel('Attack Type', fontsize=11)\n",
+    "    box_ax.set_ylabel('Anomaly Score', fontsize=11)\n",
+    "    box_ax.set_title('Anomaly Scores by Attack Type', fontsize=13, fontweight='bold')\n",
+    "    box_ax.get_figure().suptitle('')\n",
+    "    # Make this Axes current so plt.xticks applies to it.\n",
+    "    plt.sca(box_ax)\n",
+    "    plt.xticks(rotation=45, ha='right')\n",
+    "\n",
+    "    # Category by attack type; normalize='index' makes each row sum to 100%.\n",
+    "    anomaly_attack = pd.crosstab(df['Attack Type'], df['anomaly_category'], normalize='index') * 100\n",
+    "    anomaly_attack.plot(kind='bar', stacked=True, ax=stacked_ax, color=colors)\n",
+    "    stacked_ax.set_xlabel('Attack Type', fontsize=11)\n",
+    "    stacked_ax.set_ylabel('Percentage', fontsize=11)\n",
+    "    stacked_ax.set_title('Anomaly Categories by Attack Type (%)', fontsize=13, fontweight='bold')\n",
+    "    stacked_ax.legend(title='Anomaly Level', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "    stacked_ax.tick_params(axis='x', rotation=45)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 📊 Key Findings - Data Bin Trends\n",
+    "\n",
+    "**Summary:**\n",
+    "- Write your key findings here\n",
+    "- What are the dominant packet sizes per attack type?\n",
+    "- Which ports are most targeted?\n",
+    "- Protocol preferences?\n",
+    "- Anomaly score patterns?\n",
+    "\n",
+    "**Attack Signatures Identified:**\n",
+    "- DDoS: [packet size pattern, protocol, ports]\n",
+    "- Malware: [packet size pattern, protocol, ports]\n",
+    "- etc.\n",
+    "\n",
+    "**Recommendation for ML Model:**\n",
+    "- Use binned features: `packet_length_bin`, `dst_port_category`, `src_port_category`, `anomaly_category`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. COMPREHENSIVE SUMMARY & INSIGHTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Recap of dataset size, class balance and headline statistics.\n",
+    "# Every column is checked individually before use, so a missing column\n",
+    "# skips its line instead of aborting the summary with a KeyError.\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"COMPREHENSIVE EDA SUMMARY\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "print(f\"\\n📊 DATASET OVERVIEW\")\n",
+    "print(\"-\" * 80)\n",
+    "print(f\"Total Records: {len(df):,}\")\n",
+    "# NOTE: counts the columns df has at this point, including engineered ones\n",
+    "# such as packet_length_bin that earlier cells added.\n",
+    "print(f\"Total Features: {df.shape[1]}\")\n",
+    "print(f\"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\")\n",
+    "\n",
+    "if 'Attack Type' in df.columns:\n",
+    "    print(f\"\\n🎯 ATTACK TYPE DISTRIBUTION\")\n",
+    "    print(\"-\" * 80)\n",
+    "    attack_dist = df['Attack Type'].value_counts()\n",
+    "    for attack, count in attack_dist.items():\n",
+    "        print(f\"  {attack}: {count:,} ({count/len(df)*100:.2f}%)\")\n",
+    "\n",
+    "print(f\"\\n🔍 KEY STATISTICS\")\n",
+    "print(\"-\" * 80)\n",
+    "\n",
+    "# Proxy\n",
+    "if 'has_proxy' in df.columns:\n",
+    "    proxy_pct = (df['has_proxy'].sum() / len(df)) * 100\n",
+    "    print(f\"  - Proxy Usage Rate: {proxy_pct:.2f}%\")\n",
+    "\n",
+    "# IPs: the destination column gets its own guard rather than relying on\n",
+    "# the source column being present.\n",
+    "if 'Source IP Address' in df.columns:\n",
+    "    print(f\"  - Unique Source IPs: {df['Source IP Address'].nunique():,}\")\n",
+    "if 'Destination IP Address' in df.columns:\n",
+    "    print(f\"  - Unique Destination IPs: {df['Destination IP Address'].nunique():,}\")\n",
+    "\n",
+    "# Packet Length\n",
+    "if 'Packet Length' in df.columns:\n",
+    "    print(f\"  - Average Packet Size: {df['Packet Length'].mean():.2f} bytes\")\n",
+    "\n",
+    "# Protocol\n",
+    "if 'Protocol' in df.columns:\n",
+    "    protocol_counts = df['Protocol'].value_counts()\n",
+    "    # value_counts() sorts descending, so the first entry is the most common\n",
+    "    # value; the result is empty when the column holds only missing values.\n",
+    "    if not protocol_counts.empty:\n",
+    "        top_protocol = protocol_counts.index[0]\n",
+    "        top_protocol_pct = (protocol_counts.iloc[0] / len(df)) * 100\n",
+    "        print(f\"  - Most Common Protocol: {top_protocol} ({top_protocol_pct:.2f}%)\")\n",
+    "\n",
+    "# Port\n",
+    "if 'Destination Port' in df.columns:\n",
+    "    dst_port_counts = df['Destination Port'].value_counts()\n",
+    "    if not dst_port_counts.empty:\n",
+    "        top_port = dst_port_counts.index[0]\n",
+    "        top_port_count = dst_port_counts.iloc[0]\n",
+    "        print(f\"  - Most Targeted Port: {top_port} ({top_port_count:,} times)\")\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*80)\n",
+    "print(\"✅ EDA ANALYSIS COMPLETE!\")\n",
+    "print(\"=\"*80)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 6. EXPORT ENGINEERED FEATURES (Optional)\n",
+    "\n",
+    "Collect the features engineered during the EDA into a single DataFrame for use in the ML model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a summary of engineered features\n",
+    "# Earlier cells may have skipped some of these columns (the summary cell\n",
+    "# already treats 'has_proxy' as optional), so select only what exists and\n",
+    "# report the rest instead of failing with a KeyError.\n",
+    "engineered_feature_names = [\n",
+    "    'has_proxy',                    # Binary: 0/1\n",
+    "    'is_bidirectional',            # Binary: 0/1  \n",
+    "    'src_is_private',              # Binary: 0/1\n",
+    "    'dst_is_private',              # Binary: 0/1\n",
+    "    'packet_length_bin',           # Categorical: 7 categories\n",
+    "    'dst_port_category',           # Categorical: 3 port ranges + 'Unknown'\n",
+    "    'src_port_category',           # Categorical: 3 port ranges + 'Unknown'\n",
+    "    'anomaly_category'             # Categorical: 4 categories\n",
+    "]\n",
+    "\n",
+    "missing_features = [name for name in engineered_feature_names if name not in df.columns]\n",
+    "if missing_features:\n",
+    "    print(f\"Warning: engineered features not found in df and skipped: {missing_features}\")\n",
+    "\n",
+    "engineered_features = df[\n",
+    "    [name for name in engineered_feature_names if name in df.columns]\n",
+    "].copy()\n",
+    "\n",
+    "print(\"Engineered Features Summary:\")\n",
+    "print(engineered_features.head(10))\n",
+    "print(f\"\\nShape: {engineered_features.shape}\")\n",
+    "print(f\"\\nFeature Data Types:\")\n",
+    "print(engineered_features.dtypes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: Save engineered features to CSV\n",
+    "# engineered_features.to_csv('engineered_features.csv', index=False)\n",
+    "# print(\"✓ Engineered features saved to 'engineered_features.csv'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 7. CONCLUSIONS & RECOMMENDATIONS\n",
+    "\n",
+    "### Key Findings:\n",
+    "1. **Proxy Usage:**\n",
+    "   - [Your findings here]\n",
+    "   \n",
+    "2. **IP Spoofing Indicators:**\n",
+    "   - [Your findings here]\n",
+    "   \n",
+    "3. **Data Bin Patterns:**\n",
+    "   - [Your findings here]\n",
+    "\n",
+    "### Recommendations for ML Model:\n",
+    "1. Binary features: `has_proxy`, `is_bidirectional`, `src_is_private`, `dst_is_private`\n",
+    "2. Categorical features: `packet_length_bin`, `dst_port_category`, `src_port_category`, `anomaly_category`\n",
+    "3. Numerical features: Consider creating fan-out/fan-in scores\n",
+    "4. Attack-specific patterns identified can guide feature importance analysis\n",
+    "\n",
+    "### Next Steps:\n",
+    "1. Data preprocessing (handle missing values, encode categoricals)\n",
+    "2. Feature scaling/normalization\n",
+    "3. Address class imbalance if needed\n",
+    "4. Model selection and training\n",
+    "5. Hyperparameter tuning\n",
+    "6. Model evaluation"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}