diff --git a/cybersecurity_eda.ipynb b/cybersecurity_eda.ipynb new file mode 100644 index 0000000..fae9856 --- /dev/null +++ b/cybersecurity_eda.ipynb @@ -0,0 +1,1271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cybersecurity Attack Type Detection - EDA\n", + "## Focus: Proxy + IP Trends, Spoofing Detection, and Data Bin Trends\n", + "\n", + "**Team Member:** [Your Name] \n", + "**Date:** January 31, 2026 \n", + "**Dataset:** 40,000 rows, 25 features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Setup and Data Loading" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from collections import Counter\n", + "import warnings\n", + "# NOTE(review): this silences every warning, including pandas/seaborn deprecations - consider narrowing the filter\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Set visualization style\n", + "sns.set_style(\"whitegrid\")\n", + "plt.rcParams['figure.figsize'] = (15, 8)\n", + "plt.rcParams['font.size'] = 10\n", + "\n", + "print(\"✓ Libraries imported successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load dataset\n", + "# TODO: Update the filepath to your actual CSV file location\n", + "df = pd.read_csv('your_dataset.csv')\n", + "\n", + "print(f\"Dataset Shape: {df.shape}\")\n", + "print(f\"Total Records: {df.shape[0]:,}\")\n", + "print(f\"Total Features: {df.shape[1]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display first few rows\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Data types and basic info\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, +
"outputs": [], + "source": [ + "# Missing values analysis\n", + "missing_df = pd.DataFrame({\n", + " 'Missing_Count': df.isnull().sum(),\n", + " 'Percentage': (df.isnull().sum() / len(df)) * 100,\n", + " 'Distinct_Count': df.nunique(),\n", + " 'Distinct_Percentage': (df.nunique() / len(df)) * 100\n", + "}).sort_values('Missing_Count', ascending=False)\n", + "\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"MISSING VALUES AND DISTINCTNESS ANALYSIS\")\n", + "print(\"=\"*80)\n", + "print(missing_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Attack Type distribution\n", + "if 'Attack Type' in df.columns:\n", + " print(\"\\nAttack Type Distribution:\")\n", + " print(df['Attack Type'].value_counts())\n", + " \n", + " plt.figure(figsize=(12, 6))\n", + " df['Attack Type'].value_counts().plot(kind='bar', color='steelblue', edgecolor='black')\n", + " plt.title('Attack Type Distribution', fontsize=14, fontweight='bold')\n", + " plt.xlabel('Attack Type')\n", + " plt.ylabel('Count')\n", + " plt.xticks(rotation=45, ha='right')\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 2. 
PART 1: Proxy Information Analysis\n", + "\n", + "**Key Insights from Data Profiling:**\n", + "- 50% missing values (19,851 out of 40,000)\n", + "- 20,148 distinct values when present (highly diverse)\n", + "- This suggests proxy info is present only for certain attacks/sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Proxy Information Analysis\n", + "print(\"=\"*80)\n", + "print(\"PROXY INFORMATION ANALYSIS\")\n", + "print(\"=\"*80)\n", + "\n", + "if 'Proxy Information' in df.columns:\n", + " # Basic statistics\n", + " total_records = len(df)\n", + " proxy_present = df['Proxy Information'].notna().sum()\n", + " proxy_missing = df['Proxy Information'].isna().sum()\n", + " unique_proxies = df['Proxy Information'].nunique()\n", + " \n", + " print(f\"\\nProxy Information Statistics:\")\n", + " print(f\" - Total records: {total_records:,}\")\n", + " print(f\" - Records WITH proxy info: {proxy_present:,} ({proxy_present/total_records*100:.2f}%)\")\n", + " print(f\" - Records WITHOUT proxy info: {proxy_missing:,} ({proxy_missing/total_records*100:.2f}%)\")\n", + " print(f\" - Unique proxy values: {unique_proxies:,}\")\n", + " \n", + " # Create binary feature: has_proxy\n", + " df['has_proxy'] = df['Proxy Information'].notna().astype(int)\n", + " \n", + " print(f\"\\nProxy Usage Distribution:\")\n", + " print(df['has_proxy'].value_counts())\n", + "else:\n", + " print(\"Warning: 'Proxy Information' column not found!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize proxy usage patterns\n", + "if 'has_proxy' in df.columns:\n", + " fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", + " \n", + " # 1. 
Overall proxy usage pie chart\n", + " proxy_counts = df['has_proxy'].value_counts().reindex([0, 1], fill_value=0) # pin order to [0, 1]: value_counts() sorts by frequency, which would swap the fixed labels/colors below\n", + " labels = ['No Proxy', 'With Proxy']\n", + " colors = ['lightcoral', 'lightgreen']\n", + " axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%', \n", + " colors=colors, startangle=90)\n", + " axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold')\n", + " \n", + " # 2. Proxy usage by Attack Type\n", + " if 'Attack Type' in df.columns:\n", + " proxy_attack = pd.crosstab(df['Attack Type'], df['has_proxy'], normalize='index') * 100\n", + " proxy_attack.plot(kind='bar', ax=axes[0, 1], stacked=False, \n", + " color=['lightcoral', 'lightgreen'])\n", + " axes[0, 1].set_title('Proxy Usage by Attack Type (%)', fontsize=14, fontweight='bold')\n", + " axes[0, 1].set_xlabel('Attack Type')\n", + " axes[0, 1].set_ylabel('Percentage')\n", + " axes[0, 1].legend(['No Proxy', 'With Proxy'])\n", + " axes[0, 1].tick_params(axis='x', rotation=45)\n", + " \n", + " # Print statistical summary\n", + " print(\"\\nProxy Usage by Attack Type:\")\n", + " print(proxy_attack)\n", + " \n", + " # 3. Proxy usage by Severity Level\n", + " if 'Severity Level' in df.columns:\n", + " proxy_severity = pd.crosstab(df['Severity Level'], df['has_proxy'])\n", + " proxy_severity.plot(kind='bar', ax=axes[1, 0], color=['lightcoral', 'lightgreen'])\n", + " axes[1, 0].set_title('Proxy Usage by Severity Level', fontsize=14, fontweight='bold')\n", + " axes[1, 0].set_xlabel('Severity Level')\n", + " axes[1, 0].set_ylabel('Count')\n", + " axes[1, 0].legend(['No Proxy', 'With Proxy'])\n", + " axes[1, 0].tick_params(axis='x', rotation=45)\n", + " \n", + " # 4.
Proxy usage over time\n", + " if 'Timestamp' in df.columns:\n", + " df_temp = df.copy()\n", + " df_temp['Timestamp'] = pd.to_datetime(df_temp['Timestamp'], errors='coerce')\n", + " df_temp = df_temp.dropna(subset=['Timestamp'])\n", + " df_temp['Date'] = df_temp['Timestamp'].dt.date\n", + " \n", + " proxy_time = df_temp.groupby('Date')['has_proxy'].agg(['sum', 'count'])\n", + " proxy_time['percentage'] = (proxy_time['sum'] / proxy_time['count']) * 100\n", + " \n", + " axes[1, 1].plot(proxy_time.index, proxy_time['percentage'], \n", + " marker='o', color='steelblue', linewidth=2)\n", + " axes[1, 1].set_title('Proxy Usage Trend Over Time', fontsize=14, fontweight='bold')\n", + " axes[1, 1].set_xlabel('Date')\n", + " axes[1, 1].set_ylabel('Percentage Using Proxy')\n", + " axes[1, 1].tick_params(axis='x', rotation=45)\n", + " axes[1, 1].grid(True, alpha=0.3)\n", + " \n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze relationship with Log Source (Firewall vs Server)\n", + "if 'Log Source' in df.columns and 'has_proxy' in df.columns:\n", + " print(\"\\nProxy Usage by Log Source:\")\n", + " log_proxy = pd.crosstab(df['Log Source'], df['has_proxy'], normalize='index') * 100\n", + " print(log_proxy)\n", + " \n", + " # Visualize\n", + " log_proxy.plot(kind='bar', figsize=(10, 6), color=['lightcoral', 'lightgreen'])\n", + " plt.title('Proxy Usage: Firewall vs Server Logs', fontsize=14, fontweight='bold')\n", + " plt.xlabel('Log Source')\n", + " plt.ylabel('Percentage')\n", + " plt.legend(['No Proxy', 'With Proxy'])\n", + " plt.xticks(rotation=0)\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Descriptive proxy-usage breakdown per attack type (no significance test is run here)\n", + "if 'Attack Type' in df.columns and 'has_proxy' in df.columns:\n", + " print(\"\\n\" + \"=\"*80)\n", + "
print(\"PROXY USAGE INSIGHTS BY ATTACK TYPE\")\n", + " print(\"=\"*80)\n", + " \n", + " for attack_type in df['Attack Type'].unique():\n", + " subset = df[df['Attack Type'] == attack_type]\n", + " proxy_pct = (subset['has_proxy'].sum() / len(subset)) * 100\n", + " \n", + " print(f\"\\n{attack_type}:\")\n", + " print(f\" - Total attacks: {len(subset):,}\")\n", + " print(f\" - With proxy: {subset['has_proxy'].sum():,} ({proxy_pct:.2f}%)\")\n", + " print(f\" - Without proxy: {len(subset) - subset['has_proxy'].sum():,} ({100-proxy_pct:.2f}%)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 📊 Key Findings - Proxy Analysis\n", + "\n", + "**Summary:**\n", + "- Write your key findings here after running the cells above\n", + "- Which attack types use proxies most?\n", + "- Is there a correlation with severity?\n", + "- Any temporal patterns?\n", + "\n", + "**Recommendation for ML Model:**\n", + "- The binary feature `has_proxy` appears to be a strong discriminator\n", + "- Consider as a key feature in your model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 3. PART 2: IP Trends and Spoofing Detection\n", + "\n", + "**Analysis Goals:**\n", + "1. Identify top source and destination IPs\n", + "2. Detect fan-out patterns (one source → many destinations = scanning/spoofing)\n", + "3. Detect fan-in patterns (many sources → one destination = DDoS)\n", + "4. Analyze bidirectional traffic\n", + "5.
Detect private IP usage anomalies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\"*80)\n", + "print(\"IP TRENDS AND SPOOFING DETECTION\")\n", + "print(\"=\"*80)\n", + "\n", + "# Basic IP statistics\n", + "if 'Source IP Address' in df.columns and 'Destination IP Address' in df.columns:\n", + " print(f\"\\nIP Address Statistics:\")\n", + " print(f\" - Unique Source IPs: {df['Source IP Address'].nunique():,}\")\n", + " print(f\" - Unique Destination IPs: {df['Destination IP Address'].nunique():,}\")\n", + " print(f\" - Total IP-to-IP connections: {len(df):,}\")\n", + " print(f\" - Average connections per source IP: {len(df)/df['Source IP Address'].nunique():.2f}\")\n", + " print(f\" - Average connections per destination IP: {len(df)/df['Destination IP Address'].nunique():.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Top Source IPs\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"TOP SOURCE IP ADDRESSES\")\n", + "print(\"=\"*80)\n", + "\n", + "top_src_ips = df['Source IP Address'].value_counts().head(20)\n", + "print(\"\\nTop 20 Source IPs:\")\n", + "print(top_src_ips)\n", + "\n", + "# Visualize\n", + "plt.figure(figsize=(12, 8))\n", + "plt.barh(range(len(top_src_ips)), top_src_ips.values, color='steelblue')\n", + "plt.yticks(range(len(top_src_ips)), top_src_ips.index)\n", + "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n", + "plt.ylabel('Source IP Address', fontsize=12)\n", + "plt.title('Top 20 Most Active Source IP Addresses', fontsize=14, fontweight='bold')\n", + "plt.gca().invert_yaxis()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Top Destination IPs\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"TOP DESTINATION IP ADDRESSES\")\n", + "print(\"=\"*80)\n", + "\n", + "top_dst_ips 
= df['Destination IP Address'].value_counts().head(20)\n", + "print(\"\\nTop 20 Destination IPs:\")\n", + "print(top_dst_ips)\n", + "\n", + "# Visualize\n", + "plt.figure(figsize=(12, 8))\n", + "plt.barh(range(len(top_dst_ips)), top_dst_ips.values, color='coral')\n", + "plt.yticks(range(len(top_dst_ips)), top_dst_ips.index)\n", + "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n", + "plt.ylabel('Destination IP Address', fontsize=12)\n", + "plt.title('Top 20 Most Targeted Destination IP Addresses', fontsize=14, fontweight='bold')\n", + "plt.gca().invert_yaxis()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SPOOFING DETECTION 1: Fan-out Analysis (Source IP → Multiple Destinations)\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"SPOOFING INDICATOR 1: FAN-OUT PATTERN (Source → Multiple Destinations)\")\n", + "print(\"=\"*80)\n", + "\n", + "# Count unique destinations per source IP\n", + "src_to_dst_mapping = df.groupby('Source IP Address')['Destination IP Address'].nunique()\n", + "src_to_dst_mapping = src_to_dst_mapping.sort_values(ascending=False)\n", + "\n", + "# Calculate thresholds\n", + "threshold_95 = src_to_dst_mapping.quantile(0.95)\n", + "threshold_99 = src_to_dst_mapping.quantile(0.99)\n", + "\n", + "suspicious_sources_95 = src_to_dst_mapping[src_to_dst_mapping > threshold_95]\n", + "suspicious_sources_99 = src_to_dst_mapping[src_to_dst_mapping > threshold_99]\n", + "\n", + "print(f\"\\nFan-out Statistics:\")\n", + "print(f\" - Mean destinations per source: {src_to_dst_mapping.mean():.2f}\")\n", + "print(f\" - Median destinations per source: {src_to_dst_mapping.median():.2f}\")\n", + "print(f\" - 95th percentile threshold: {threshold_95:.0f} destinations\")\n", + "print(f\" - 99th percentile threshold: {threshold_99:.0f} destinations\")\n", + "print(f\"\\nSuspicious Source IPs:\")\n", + "print(f\" - IPs above 95th percentile: 
{len(suspicious_sources_95)} ({len(suspicious_sources_95)/len(src_to_dst_mapping)*100:.2f}%)\")\n", + "print(f\" - IPs above 99th percentile: {len(suspicious_sources_99)} ({len(suspicious_sources_99)/len(src_to_dst_mapping)*100:.2f}%)\")\n", + "\n", + "print(f\"\\nTop 10 Source IPs with Highest Fan-out:\")\n", + "print(src_to_dst_mapping.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize fan-out distribution\n", + "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + "\n", + "# Histogram\n", + "axes[0].hist(src_to_dst_mapping.values, bins=50, color='red', alpha=0.7, edgecolor='black')\n", + "axes[0].axvline(threshold_95, color='darkred', linestyle='--', linewidth=2, \n", + " label=f'95th percentile: {threshold_95:.0f}')\n", + "axes[0].axvline(threshold_99, color='maroon', linestyle='--', linewidth=2, \n", + " label=f'99th percentile: {threshold_99:.0f}')\n", + "axes[0].set_xlabel('Number of Unique Destinations per Source IP', fontsize=12)\n", + "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n", + "axes[0].set_title('Source IP Fan-out Distribution\\n(Potential Scanning/Spoofing)', \n", + " fontsize=14, fontweight='bold')\n", + "axes[0].set_yscale('log')\n", + "axes[0].legend()\n", + "axes[0].grid(True, alpha=0.3)\n", + "\n", + "# Top suspicious IPs\n", + "top_suspicious = src_to_dst_mapping.head(15)\n", + "axes[1].barh(range(len(top_suspicious)), top_suspicious.values, color='darkred')\n", + "axes[1].set_yticks(range(len(top_suspicious)))\n", + "axes[1].set_yticklabels(top_suspicious.index)\n", + "axes[1].set_xlabel('Number of Unique Destinations', fontsize=12)\n", + "axes[1].set_ylabel('Source IP Address', fontsize=12)\n", + "axes[1].set_title('Top 15 Source IPs by Fan-out\\n(Most Suspicious)', \n", + " fontsize=14, fontweight='bold')\n", + "axes[1].invert_yaxis()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SPOOFING DETECTION 2: Fan-in Analysis (Multiple Sources → Single Destination)\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"SPOOFING INDICATOR 2: FAN-IN PATTERN (Multiple Sources → Destination)\")\n", + "print(\"=\"*80)\n", + "\n", + "# Count unique sources per destination IP\n", + "dst_to_src_mapping = df.groupby('Destination IP Address')['Source IP Address'].nunique()\n", + "dst_to_src_mapping = dst_to_src_mapping.sort_values(ascending=False)\n", + "\n", + "# Calculate thresholds\n", + "threshold_95_dst = dst_to_src_mapping.quantile(0.95)\n", + "threshold_99_dst = dst_to_src_mapping.quantile(0.99)\n", + "\n", + "suspicious_targets_95 = dst_to_src_mapping[dst_to_src_mapping > threshold_95_dst]\n", + "suspicious_targets_99 = dst_to_src_mapping[dst_to_src_mapping > threshold_99_dst]\n", + "\n", + "print(f\"\\nFan-in Statistics:\")\n", + "print(f\" - Mean sources per destination: {dst_to_src_mapping.mean():.2f}\")\n", + "print(f\" - Median sources per destination: {dst_to_src_mapping.median():.2f}\")\n", + "print(f\" - 95th percentile threshold: {threshold_95_dst:.0f} sources\")\n", + "print(f\" - 99th percentile threshold: {threshold_99_dst:.0f} sources\")\n", + "print(f\"\\nSuspicious Target IPs (Potential DDoS Victims):\")\n", + "print(f\" - IPs above 95th percentile: {len(suspicious_targets_95)} ({len(suspicious_targets_95)/len(dst_to_src_mapping)*100:.2f}%)\")\n", + "print(f\" - IPs above 99th percentile: {len(suspicious_targets_99)} ({len(suspicious_targets_99)/len(dst_to_src_mapping)*100:.2f}%)\")\n", + "\n", + "print(f\"\\nTop 10 Target IPs with Highest Fan-in:\")\n", + "print(dst_to_src_mapping.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize fan-in distribution\n", + "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + "\n", + "# Histogram\n", + "axes[0].hist(dst_to_src_mapping.values, 
bins=50, color='purple', alpha=0.7, edgecolor='black')\n", + "axes[0].axvline(threshold_95_dst, color='darkviolet', linestyle='--', linewidth=2, \n", + " label=f'95th percentile: {threshold_95_dst:.0f}')\n", + "axes[0].axvline(threshold_99_dst, color='indigo', linestyle='--', linewidth=2, \n", + " label=f'99th percentile: {threshold_99_dst:.0f}')\n", + "axes[0].set_xlabel('Number of Unique Sources per Destination IP', fontsize=12)\n", + "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n", + "axes[0].set_title('Destination IP Fan-in Distribution\\n(Potential DDoS Targets)', \n", + " fontsize=14, fontweight='bold')\n", + "axes[0].set_yscale('log')\n", + "axes[0].legend()\n", + "axes[0].grid(True, alpha=0.3)\n", + "\n", + "# Top targeted IPs\n", + "top_targets = dst_to_src_mapping.head(15)\n", + "axes[1].barh(range(len(top_targets)), top_targets.values, color='darkviolet')\n", + "axes[1].set_yticks(range(len(top_targets)))\n", + "axes[1].set_yticklabels(top_targets.index)\n", + "axes[1].set_xlabel('Number of Unique Sources', fontsize=12)\n", + "axes[1].set_ylabel('Destination IP Address', fontsize=12)\n", + "axes[1].set_title('Top 15 Destination IPs by Fan-in\\n(Potential DDoS Targets)', \n", + " fontsize=14, fontweight='bold')\n", + "axes[1].invert_yaxis()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SPOOFING DETECTION 3: Bidirectional Traffic Analysis\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"SPOOFING INDICATOR 3: BIDIRECTIONAL TRAFFIC\")\n", + "print(\"=\"*80)\n", + "\n", + "source_ips_set = set(df['Source IP Address'].dropna())\n", + "dest_ips_set = set(df['Destination IP Address'].dropna())\n", + "bidirectional_ips = source_ips_set.intersection(dest_ips_set)\n", + "\n", + "print(f\"\\nBidirectional IP Statistics:\")\n", + "print(f\" - Total unique source IPs: {len(source_ips_set):,}\")\n", + "print(f\" - Total unique 
destination IPs: {len(dest_ips_set):,}\")\n", + "print(f\" - IPs appearing as BOTH source and destination: {len(bidirectional_ips):,}\")\n", + "print(f\" - Percentage of bidirectional IPs: {len(bidirectional_ips)/(len(source_ips_set.union(dest_ips_set)))*100:.2f}%\")\n", + "\n", + "# Analyze bidirectional traffic by attack type\n", + "if 'Attack Type' in df.columns:\n", + " df['is_bidirectional'] = (df['Source IP Address'].isin(bidirectional_ips)) | \\\n", + " (df['Destination IP Address'].isin(bidirectional_ips))\n", + " \n", + " print(f\"\\nBidirectional Traffic by Attack Type:\")\n", + " bidir_attack = pd.crosstab(df['Attack Type'], df['is_bidirectional'], normalize='index').reindex(columns=[False, True], fill_value=0) * 100 # keep both columns so bidir_attack[True] exists even when no IP is bidirectional\n", + " print(bidir_attack)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize bidirectional traffic\n", + "if 'Attack Type' in df.columns and 'is_bidirectional' in df.columns:\n", + " plt.figure(figsize=(12, 6))\n", + " bidir_attack[True].sort_values().plot(kind='barh', color='teal', edgecolor='black')\n", + " plt.xlabel('Percentage of Traffic with Bidirectional IPs', fontsize=12)\n", + " plt.ylabel('Attack Type', fontsize=12)\n", + " plt.title('Bidirectional IP Traffic by Attack Type', fontsize=14, fontweight='bold')\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, +
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SPOOFING DETECTION 4: Private IP Detection\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"SPOOFING INDICATOR 4: PRIVATE IP ADDRESS DETECTION\")\n", + "print(\"=\"*80)\n", + "\n", + "def is_private_ip(ip):\n", + "    \"\"\"Return True if `ip` is a valid dotted-quad IPv4 address in an RFC 1918 private range.\n", + "\n", + "    Missing values, strings that are not four dot-separated integers, and\n", + "    addresses with any octet outside 0-255 all return False.\n", + "    \"\"\"\n", + "    if pd.isna(ip):\n", + "        return False\n", + "    parts = str(ip).split('.')\n", + "    if len(parts) != 4:\n", + "        return False\n", + "    try:\n", + "        octets = [int(part) for part in parts]\n", + "    except ValueError:\n", + "        # A non-numeric octet means this is not an IPv4 address; other errors\n", + "        # (e.g. KeyboardInterrupt during .apply) must propagate\n", + "        return False\n", + "    if not all(0 <= octet <= 255 for octet in octets):\n", + "        return False\n", + "    first, second = octets[0], octets[1]\n", + "    # Private IP ranges: 10.x.x.x, 172.16-31.x.x, 192.168.x.x\n", + "    return (\n", + "        first == 10\n", + "        or (first == 172 and 16 <= second <= 31)\n", + "        or (first == 192 and second == 168)\n", + "    )\n", + "\n", + "df['src_is_private'] = df['Source IP Address'].apply(is_private_ip)\n", + "df['dst_is_private'] = df['Destination IP Address'].apply(is_private_ip)\n", + "\n", + "print(f\"\\nPrivate IP Statistics:\")\n", + "print(f\" - Source IPs from private ranges: {df['src_is_private'].sum():,} ({df['src_is_private'].sum()/len(df)*100:.2f}%)\")\n", + "print(f\" - Destination IPs from private ranges: {df['dst_is_private'].sum():,} ({df['dst_is_private'].sum()/len(df)*100:.2f}%)\")\n", + "print(f\" - Total connections involving private IPs: {(df['src_is_private'] | df['dst_is_private']).sum():,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze private IP usage by attack type\n", + "if 'Attack Type' in df.columns:\n", + " print(\"\\nPrivate IP Usage by Attack Type:\")\n", + " attack_private = df.groupby('Attack Type').agg({\n", + " 'src_is_private': ['sum', 'mean'],\n", + " 'dst_is_private': ['sum', 'mean']\n", + " })\n", + " attack_private.columns = ['Src_Private_Count', 'Src_Private_Pct', 'Dst_Private_Count', 'Dst_Private_Pct']\n", + " attack_private['Src_Private_Pct'] = attack_private['Src_Private_Pct'] * 100\n", + " attack_private['Dst_Private_Pct'] = attack_private['Dst_Private_Pct'] * 100\n", + " print(attack_private)\n", + " \n", + " # Visualize\n", + " fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + " \n", + " attack_private['Src_Private_Pct'].plot(kind='bar', ax=axes[0], color='orange', edgecolor='black')\n", + " axes[0].set_title('Source Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n", + " axes[0].set_xlabel('Attack Type')\n", + " axes[0].set_ylabel('Percentage')\n", + " axes[0].tick_params(axis='x',
rotation=45)\n", + " \n", + " attack_private['Dst_Private_Pct'].plot(kind='bar', ax=axes[1], color='red', edgecolor='black')\n", + " axes[1].set_title('Destination Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n", + " axes[1].set_xlabel('Attack Type')\n", + " axes[1].set_ylabel('Percentage')\n", + " axes[1].tick_params(axis='x', rotation=45)\n", + " \n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Geo-location analysis (if available)\n", + "if 'Geo-location Data' in df.columns:\n", + " print(\"\\n\" + \"=\"*80)\n", + " print(\"GEO-LOCATION ANALYSIS\")\n", + " print(\"=\"*80)\n", + " \n", + " print(f\"\\nGeo-location Statistics:\")\n", + " print(f\" - Unique locations: {df['Geo-location Data'].nunique():,}\")\n", + " print(f\" - Missing values: {df['Geo-location Data'].isna().sum():,}\")\n", + " \n", + " # Top locations\n", + " print(f\"\\nTop 15 Geo-locations:\")\n", + " top_locations = df['Geo-location Data'].value_counts().head(15)\n", + " print(top_locations)\n", + " \n", + " # Visualize\n", + " plt.figure(figsize=(14, 8))\n", + " top_locations.plot(kind='barh', color='skyblue', edgecolor='black')\n", + " plt.xlabel('Frequency', fontsize=12)\n", + " plt.ylabel('Geo-location', fontsize=12)\n", + " plt.title('Top 15 Geo-locations in Attack Traffic', fontsize=14, fontweight='bold')\n", + " plt.gca().invert_yaxis()\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + " # Geo-location by attack type\n", + " if 'Attack Type' in df.columns:\n", + " print(f\"\\nTop Geo-location by Attack Type:\")\n", + " for attack in df['Attack Type'].unique():\n", + " top_loc = df[df['Attack Type'] == attack]['Geo-location Data'].value_counts().head(1)\n", + " if len(top_loc) > 0:\n", + " print(f\" {attack}: {top_loc.index[0]} ({top_loc.values[0]} occurrences)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
📊 Key Findings - IP Trends & Spoofing\n", + "\n", + "**Summary:**\n", + "- Write your key findings here\n", + "- How many suspicious IPs detected (fan-out/fan-in)?\n", + "- Any DDoS targets identified?\n", + "- Private IP issues?\n", + "- Geographic patterns?\n", + "\n", + "**Red Flags Identified:**\n", + "- List specific suspicious IPs or patterns\n", + "\n", + "**Recommendation for ML Model:**\n", + "- Create features: source_fanout_score, dest_fanin_score, is_bidirectional, is_private_ip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 4. PART 3: Data Bin Trends Analysis\n", + "\n", + "**Analysis Goals:**\n", + "1. Packet Length distribution and binning\n", + "2. Port usage patterns (well-known, registered, dynamic)\n", + "3. Protocol distribution\n", + "4. Anomaly score categorization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\"*80)\n", + "print(\"DATA BIN TRENDS ANALYSIS\")\n", + "print(\"=\"*80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1.
PACKET LENGTH ANALYSIS\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"PACKET LENGTH DISTRIBUTION\")\n", + "print(\"=\"*80)\n", + "\n", + "if 'Packet Length' in df.columns:\n", + " packet_lengths = df['Packet Length'].dropna()\n", + " \n", + " print(f\"\\nPacket Length Statistics:\")\n", + " print(f\" - Mean: {packet_lengths.mean():.2f} bytes\")\n", + " print(f\" - Median: {packet_lengths.median():.2f} bytes\")\n", + " print(f\" - Std Dev: {packet_lengths.std():.2f} bytes\")\n", + " print(f\" - Min: {packet_lengths.min():.2f} bytes\")\n", + " print(f\" - Max: {packet_lengths.max():.2f} bytes\")\n", + " print(f\" - 25th percentile: {packet_lengths.quantile(0.25):.2f} bytes\")\n", + " print(f\" - 75th percentile: {packet_lengths.quantile(0.75):.2f} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create packet length bins\n", + "bins_packet = [0, 64, 128, 256, 512, 1024, 2048, float('inf')]\n", + "labels_packet = ['0-64', '64-128', '128-256', '256-512', '512-1024', '1024-2048', '2048+']\n", + "df['packet_length_bin'] = pd.cut(df['Packet Length'], bins=bins_packet, labels=labels_packet)\n", + "\n", + "packet_bin_dist = df['packet_length_bin'].value_counts().sort_index()\n", + "print(f\"\\nPacket Length Bins Distribution:\")\n", + "print(packet_bin_dist)\n", + "print(f\"\\nPercentage Distribution:\")\n", + "print((packet_bin_dist / packet_bin_dist.sum() * 100).round(2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize packet length\n", + "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", + "\n", + "# Histogram\n", + "axes[0, 0].hist(packet_lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7)\n", + "axes[0, 0].set_xlabel('Packet Length (bytes)', fontsize=11)\n", + "axes[0, 0].set_ylabel('Frequency (log scale)', fontsize=11)\n", + "axes[0, 0].set_title('Packet Length Distribution', fontsize=13, 
fontweight='bold')\n", + "axes[0, 0].set_yscale('log')\n", + "axes[0, 0].grid(True, alpha=0.3)\n", + "\n", + "# Binned distribution\n", + "packet_bin_dist.plot(kind='bar', ax=axes[0, 1], color='coral', edgecolor='black')\n", + "axes[0, 1].set_xlabel('Packet Length Bins (bytes)', fontsize=11)\n", + "axes[0, 1].set_ylabel('Count', fontsize=11)\n", + "axes[0, 1].set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold')\n", + "axes[0, 1].tick_params(axis='x', rotation=45)\n", + "\n", + "# Box plot by attack type\n", + "if 'Attack Type' in df.columns:\n", + " df.boxplot(column='Packet Length', by='Attack Type', ax=axes[1, 0])\n", + " axes[1, 0].set_xlabel('Attack Type', fontsize=11)\n", + " axes[1, 0].set_ylabel('Packet Length (bytes)', fontsize=11)\n", + " axes[1, 0].set_title('Packet Length by Attack Type', fontsize=13, fontweight='bold')\n", + " axes[1, 0].get_figure().suptitle('') # Remove default title\n", + " plt.sca(axes[1, 0])\n", + " plt.xticks(rotation=45, ha='right')\n", + "\n", + "# Bins by attack type (stacked bar)\n", + "if 'Attack Type' in df.columns:\n", + " bin_attack = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100\n", + " bin_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='tab10')\n", + " axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n", + " axes[1, 1].set_ylabel('Percentage', fontsize=11)\n", + " axes[1, 1].set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold')\n", + " axes[1, 1].legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[1, 1].tick_params(axis='x', rotation=45)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. 
PORT ANALYSIS\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"PORT USAGE ANALYSIS\")\n", + "print(\"=\"*80)\n", + "\n", + "if 'Source Port' in df.columns and 'Destination Port' in df.columns:\n", + " # Source ports\n", + " print(f\"\\nTop 10 Source Ports:\")\n", + " top_src_ports = df['Source Port'].value_counts().head(10)\n", + " print(top_src_ports)\n", + " \n", + " # Destination ports\n", + " print(f\"\\nTop 10 Destination Ports:\")\n", + " top_dst_ports = df['Destination Port'].value_counts().head(10)\n", + " print(top_dst_ports)" + ] + }, +
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create port categories\n", + "def categorize_port(port):\n", + "    \"\"\"Label a port number as well-known, registered, or dynamic.\n", + "\n", + "    Missing values, values that cannot be converted to int, and numbers\n", + "    outside 0-65535 are labelled 'Unknown'.\n", + "    \"\"\"\n", + "    if pd.isna(port):\n", + "        return 'Unknown'\n", + "    try:\n", + "        port = int(port)\n", + "    except (TypeError, ValueError, OverflowError):\n", + "        # Not convertible to an integer port number (e.g. text or infinity);\n", + "        # anything else (e.g. KeyboardInterrupt during .apply) must propagate\n", + "        return 'Unknown'\n", + "    if 0 <= port <= 1023:\n", + "        return 'Well-known (0-1023)'\n", + "    if 1024 <= port <= 49151:\n", + "        return 'Registered (1024-49151)'\n", + "    if 49152 <= port <= 65535:\n", + "        return 'Dynamic (49152-65535)'\n", + "    return 'Unknown'\n", + "\n", + "df['dst_port_category'] = df['Destination Port'].apply(categorize_port)\n", + "df['src_port_category'] = df['Source Port'].apply(categorize_port)\n", + "\n", + "print(f\"\\nDestination Port Categories:\")\n", + "print(df['dst_port_category'].value_counts())\n", + "print(f\"\\nPercentage:\")\n", + "print((df['dst_port_category'].value_counts() / len(df) * 100).round(2))" + ] + }, +
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize port analysis\n", + "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", + "\n", + "# Top source ports\n", + "top_src_ports_15 = df['Source Port'].value_counts().head(15)\n", + "axes[0, 0].barh(range(len(top_src_ports_15)), top_src_ports_15.values, color='lightgreen')\n", + "axes[0,
0].set_yticks(range(len(top_src_ports_15)))\n", + "axes[0, 0].set_yticklabels(top_src_ports_15.index)\n", + "axes[0, 0].set_xlabel('Frequency', fontsize=11)\n", + "axes[0, 0].set_ylabel('Port Number', fontsize=11)\n", + "axes[0, 0].set_title('Top 15 Source Ports', fontsize=13, fontweight='bold')\n", + "axes[0, 0].invert_yaxis()\n", + "\n", + "# Top destination ports\n", + "top_dst_ports_15 = df['Destination Port'].value_counts().head(15)\n", + "axes[0, 1].barh(range(len(top_dst_ports_15)), top_dst_ports_15.values, color='lightcoral')\n", + "axes[0, 1].set_yticks(range(len(top_dst_ports_15)))\n", + "axes[0, 1].set_yticklabels(top_dst_ports_15.index)\n", + "axes[0, 1].set_xlabel('Frequency', fontsize=11)\n", + "axes[0, 1].set_ylabel('Port Number', fontsize=11)\n", + "axes[0, 1].set_title('Top 15 Destination Ports', fontsize=13, fontweight='bold')\n", + "axes[0, 1].invert_yaxis()\n", + "\n", + "# Port category pie chart\n", + "port_cat_dist = df['dst_port_category'].value_counts()\n", + "axes[1, 0].pie(port_cat_dist.values, labels=port_cat_dist.index, autopct='%1.1f%%', startangle=90)\n", + "axes[1, 0].set_title('Destination Port Categories', fontsize=13, fontweight='bold')\n", + "\n", + "# Port categories by attack type\n", + "if 'Attack Type' in df.columns:\n", + " port_attack = pd.crosstab(df['Attack Type'], df['dst_port_category'])\n", + " port_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='Set3')\n", + " axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n", + " axes[1, 1].set_ylabel('Count', fontsize=11)\n", + " axes[1, 1].set_title('Port Categories by Attack Type', fontsize=13, fontweight='bold')\n", + " axes[1, 1].legend(title='Port Category', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[1, 1].tick_params(axis='x', rotation=45)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. 
# 3. PROTOCOL ANALYSIS
# Prints absolute and percentage protocol frequencies.
print("\n" + "=" * 80)
print("PROTOCOL DISTRIBUTION")
print("=" * 80)

if 'Protocol' in df.columns:
    protocol_dist = df['Protocol'].value_counts()
    print("\nProtocol Distribution:")
    print(protocol_dist)
    print("\nPercentage:")
    print((protocol_dist / protocol_dist.sum() * 100).round(2))


# %%
# Visualize protocol analysis
# `protocol_dist` only exists when the 'Protocol' column does, so the charts
# are guarded by the same condition as the cell that defines it; otherwise
# this cell would raise NameError/KeyError on a dataset without the column.
if 'Protocol' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Protocol pie chart
    axes[0].pie(protocol_dist.values, labels=protocol_dist.index,
                autopct='%1.1f%%', startangle=90)
    axes[0].set_title('Protocol Distribution', fontsize=14, fontweight='bold')

    # Protocol by attack type: each attack type's row sums to 100%.
    if 'Attack Type' in df.columns:
        protocol_attack = pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100
        protocol_attack.plot(kind='bar', stacked=True, ax=axes[1], colormap='viridis')
        axes[1].set_xlabel('Attack Type', fontsize=12)
        axes[1].set_ylabel('Percentage', fontsize=12)
        axes[1].set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold')
        axes[1].legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')
        axes[1].tick_params(axis='x', rotation=45)

        print("\nProtocol Usage by Attack Type (%):")
        print(protocol_attack.round(2))

    plt.tight_layout()
    plt.show()
else:
    print("'Protocol' column not found; skipping protocol charts.")
# 4. ANOMALY SCORES ANALYSIS
# Prints summary statistics of the non-missing anomaly scores.
print("\n" + "=" * 80)
print("ANOMALY SCORES DISTRIBUTION")
print("=" * 80)

if 'Anomaly Scores' in df.columns:
    anomaly_scores = df['Anomaly Scores'].dropna()

    anomaly_stats = [
        ("Mean", anomaly_scores.mean()),
        ("Median", anomaly_scores.median()),
        ("Std Dev", anomaly_scores.std()),
        ("Min", anomaly_scores.min()),
        ("Max", anomaly_scores.max()),
        ("25th percentile", anomaly_scores.quantile(0.25)),
        ("50th percentile", anomaly_scores.quantile(0.50)),
        ("75th percentile", anomaly_scores.quantile(0.75)),
    ]
    print("\nAnomaly Score Statistics:")
    for stat_name, stat_value in anomaly_stats:
        print(f" - {stat_name}: {stat_value:.4f}")


# %%
# Create anomaly score categories based on quartiles
# Guarded like the statistics cell above: `anomaly_scores` is only defined
# when the column exists, so an unguarded cell would raise NameError.
if 'Anomaly Scores' in df.columns:
    # Bin edges: min, Q1, median, Q3, max of the observed scores.
    # NOTE: pd.cut raises ValueError when edges are not unique, which happens
    # if two quartiles coincide (heavily tied scores).
    anomaly_bins = [anomaly_scores.min(),
                    anomaly_scores.quantile(0.25),
                    anomaly_scores.quantile(0.5),
                    anomaly_scores.quantile(0.75),
                    anomaly_scores.max()]
    anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)']

    # include_lowest=True keeps the minimum score inside the first bin.
    df['anomaly_category'] = pd.cut(df['Anomaly Scores'], bins=anomaly_bins,
                                    labels=anomaly_labels, include_lowest=True)

    anomaly_cat_dist = df['anomaly_category'].value_counts().sort_index()
    print("\nAnomaly Score Categories:")
    print(anomaly_cat_dist)
    print("\nPercentage:")
    print((anomaly_cat_dist / anomaly_cat_dist.sum() * 100).round(2))
else:
    print("'Anomaly Scores' column not found; skipping anomaly categorization.")


# %%
# Visualize anomaly scores
if 'Anomaly Scores' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Histogram
    axes[0, 0].hist(anomaly_scores, bins=50, color='purple', alpha=0.7, edgecolor='black')
    axes[0, 0].set_xlabel('Anomaly Score', fontsize=11)
    axes[0, 0].set_ylabel('Frequency', fontsize=11)
    axes[0, 0].set_title('Anomaly Score Distribution', fontsize=13, fontweight='bold')
    axes[0, 0].grid(True, alpha=0.3)

    # Category bar chart; one color per label, ordered Low -> Critical.
    colors = ['green', 'yellow', 'orange', 'red']
    category_positions = range(len(anomaly_cat_dist))
    axes[0, 1].bar(category_positions, anomaly_cat_dist.values,
                   color=colors, edgecolor='black')
    axes[0, 1].set_xticks(category_positions)
    axes[0, 1].set_xticklabels(anomaly_cat_dist.index, rotation=45, ha='right')
    axes[0, 1].set_xlabel('Anomaly Category', fontsize=11)
    axes[0, 1].set_ylabel('Count', fontsize=11)
    axes[0, 1].set_title('Anomaly Score Categories', fontsize=13, fontweight='bold')

    if 'Attack Type' in df.columns:
        # Box plot by attack type
        df.boxplot(column='Anomaly Scores', by='Attack Type', ax=axes[1, 0])
        axes[1, 0].set_xlabel('Attack Type', fontsize=11)
        axes[1, 0].set_ylabel('Anomaly Score', fontsize=11)
        axes[1, 0].set_title('Anomaly Scores by Attack Type', fontsize=13, fontweight='bold')
        # DataFrame.boxplot(by=...) sets a figure-level suptitle; clear it.
        axes[1, 0].get_figure().suptitle('')
        plt.sca(axes[1, 0])
        plt.xticks(rotation=45, ha='right')

        # Category by attack type: each attack type's row sums to 100%.
        anomaly_attack = pd.crosstab(df['Attack Type'], df['anomaly_category'], normalize='index') * 100
        anomaly_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], color=colors)
        axes[1, 1].set_xlabel('Attack Type', fontsize=11)
        axes[1, 1].set_ylabel('Percentage', fontsize=11)
        axes[1, 1].set_title('Anomaly Categories by Attack Type (%)', fontsize=13, fontweight='bold')
        axes[1, 1].legend(title='Anomaly Level', bbox_to_anchor=(1.05, 1), loc='upper left')
        axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
# %% [markdown]
# ### 📊 Key Findings - Data Bin Trends
#
# **Summary:**
# - Write your key findings here
# - What are the dominant packet sizes per attack type?
# - Which ports are most targeted?
# - Protocol preferences?
# - Anomaly score patterns?
#
# **Attack Signatures Identified:**
# - DDoS: [packet size pattern, protocol, ports]
# - Malware: [packet size pattern, protocol, ports]
# - etc.
#
# **Recommendation for ML Model:**
# - Use binned features: packet_length_bin, dst_port_category, src_port_category, anomaly_category

# %% [markdown]
# ---
# ## 5. COMPREHENSIVE SUMMARY & INSIGHTS

# %%
# Prints a one-screen recap of the dataset; every statistic is skipped
# individually when the column it needs is absent.
print("\n" + "=" * 80)
print("COMPREHENSIVE EDA SUMMARY")
print("=" * 80)

print("\n📊 DATASET OVERVIEW")
print("-" * 80)
print(f"Total Records: {len(df):,}")
print(f"Total Features: {df.shape[1]}")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

if 'Attack Type' in df.columns:
    print("\n🎯 ATTACK TYPE DISTRIBUTION")
    print("-" * 80)
    attack_dist = df['Attack Type'].value_counts()
    for attack, count in attack_dist.items():
        print(f" {attack}: {count:,} ({count/len(df)*100:.2f}%)")

print("\n🔍 KEY STATISTICS")
print("-" * 80)

# Proxy
if 'has_proxy' in df.columns:
    proxy_pct = (df['has_proxy'].sum() / len(df)) * 100
    print(f" - Proxy Usage Rate: {proxy_pct:.2f}%")

# IPs: each column is checked on its own, so a missing destination column no
# longer raises KeyError just because the source column is present.
for ip_label, ip_column in (('Source', 'Source IP Address'),
                            ('Destination', 'Destination IP Address')):
    if ip_column in df.columns:
        print(f" - Unique {ip_label} IPs: {df[ip_column].nunique():,}")

# Packet Length
if 'Packet Length' in df.columns:
    print(f" - Average Packet Size: {df['Packet Length'].mean():.2f} bytes")

# Protocol
if 'Protocol' in df.columns:
    protocol_counts = df['Protocol'].value_counts()
    # value_counts() is empty when every value is missing; skip rather than
    # fail on .index[0].
    if not protocol_counts.empty:
        top_protocol = protocol_counts.index[0]
        top_protocol_pct = (protocol_counts.iloc[0] / len(df)) * 100
        print(f" - Most Common Protocol: {top_protocol} ({top_protocol_pct:.2f}%)")

# Port
if 'Destination Port' in df.columns:
    dst_port_counts = df['Destination Port'].value_counts()
    if not dst_port_counts.empty:
        top_port = dst_port_counts.index[0]
        top_port_count = dst_port_counts.iloc[0]
        print(f" - Most Targeted Port: {top_port} ({top_port_count:,} times)")

print("\n" + "=" * 80)
print("✅ EDA ANALYSIS COMPLETE!")
print("=" * 80)

# %% [markdown]
# ---
# ## 6. EXPORT ENGINEERED FEATURES (Optional)
#
# Create new features based on EDA insights for ML model

# %%
# Create a summary of engineered features
engineered_feature_columns = [
    'has_proxy',          # Binary: 0/1
    'is_bidirectional',   # Binary: 0/1
    'src_is_private',     # Binary: 0/1
    'dst_is_private',     # Binary: 0/1
    'packet_length_bin',  # Categorical: packet-size bins from the packet-length section
    'dst_port_category',  # Categorical: 4 categories (3 port ranges + 'Unknown')
    'src_port_category',  # Categorical: 4 categories (3 port ranges + 'Unknown')
    'anomaly_category',   # Categorical: 4 categories
]

# Earlier sections create these columns only when their inputs exist, so
# select what is present and report the rest instead of raising KeyError.
missing_feature_columns = [col for col in engineered_feature_columns if col not in df.columns]
if missing_feature_columns:
    print(f"Warning: engineered features not found and skipped: {missing_feature_columns}")

engineered_features = df[
    [col for col in engineered_feature_columns if col in df.columns]
].copy()

print("Engineered Features Summary:")
print(engineered_features.head(10))
print(f"\nShape: {engineered_features.shape}")
print("\nFeature Data Types:")
print(engineered_features.dtypes)
[], + "source": [ + "# Optional: Save engineered features to CSV\n", + "# engineered_features.to_csv('engineered_features.csv', index=False)\n", + "# print(\"āœ“ Engineered features saved to 'engineered_features.csv'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 7. CONCLUSIONS & RECOMMENDATIONS\n", + "\n", + "### Key Findings:\n", + "1. **Proxy Usage:**\n", + " - [Your findings here]\n", + " \n", + "2. **IP Spoofing Indicators:**\n", + " - [Your findings here]\n", + " \n", + "3. **Data Bin Patterns:**\n", + " - [Your findings here]\n", + "\n", + "### Recommendations for ML Model:\n", + "1. Binary features: `has_proxy`, `is_bidirectional`, `src_is_private`, `dst_is_private`\n", + "2. Categorical features: `packet_length_bin`, `dst_port_category`, `src_port_category`, `anomaly_category`\n", + "3. Numerical features: Consider creating fan-out/fan-in scores\n", + "4. Attack-specific patterns identified can guide feature importance analysis\n", + "\n", + "### Next Steps:\n", + "1. Data preprocessing (handle missing values, encode categoricals)\n", + "2. Feature scaling/normalization\n", + "3. Address class imbalance if needed\n", + "4. Model selection and training\n", + "5. Hyperparameter tuning\n", + "6. Model evaluation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}