#A5
#Develop an anomaly detection system for high-dimensional network traffic data using the KDD Cup 1999 dataset
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load a fixed-size subset of the 10% KDD Cup 1999 sample to keep the run fast.
# A fixed random_state makes the shuffled subset (and therefore every metric
# printed below) reproducible from run to run.
kdd = fetch_kddcup99(percent10=True, shuffle=True, random_state=42)
X_raw = kdd["data"][:20000]
y_raw = kdd["target"][:20000]

# Convert to DataFrame
df = pd.DataFrame(X_raw, columns=kdd["feature_names"])

# Binary label: 0 for normal, 1 for anomaly (any attack type)
df["binary_label"] = np.where(y_raw == b'normal.', 0, 1)

# One-hot encode the categorical columns. This is done before the split so
# that the train and test matrices share exactly the same set of columns.
df = pd.get_dummies(df, columns=["protocol_type", "service", "flag"], drop_first=True)

# Features and label. The raw array has object dtype, so cast the feature
# matrix to float explicitly instead of relying on an implicit conversion.
X = df.drop(columns=["binary_label"]).astype(np.float64)
y = df["binary_label"]

# Train/test split FIRST, so no statistic from the test set can leak into the
# preprocessing steps. Stratify to keep the normal/anomaly ratio in both folds.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit on the training fold only, then apply to both folds.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA for faster training (keep only 10 components), again fitted on the
# training fold only.
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# KNN with fewer neighbors for speed
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)  # Use all CPU cores
knn.fit(X_train, y_train)

# Prediction & Evaluation. Accuracy is derived from the predictions already
# computed rather than calling knn.score(), which would run the (expensive)
# neighbor search on the test set a second time.
y_pred = knn.predict(X_test)
accuracy = float(np.mean(y_pred == y_test.to_numpy()))
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
