Skip to content

KNN #2

@vshandrikov

Description

@vshandrikov

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris, load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from IPython.display import display

class KNNClassifier:
    """
    k-nearest-neighbours classifier implemented from scratch.

    Supports the Euclidean and Manhattan metrics, and either uniform or
    inverse-distance-weighted voting among the k nearest training samples.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Must be a positive integer; if it
        exceeds the training-set size, all training samples vote.
    metric : {'euclidean', 'manhattan'}
        Distance function used to rank neighbours.
    weighting : {'uniform', 'distance'}
        'uniform' gives every neighbour one vote; 'distance' weights each
        vote by ``1 / (distance + eps)``.
    eps : float
        Small constant added to distances to avoid division by zero when a
        query coincides with a training sample.
    """

    _WEIGHTINGS = ('uniform', 'distance')

    def __init__(self, k=5, metric='euclidean', weighting='uniform', eps=1e-8):
        self.k = k
        self.metric = metric
        self.weighting = weighting
        self.eps = eps
        self.X_train = None
        self.y_train = None

    def _validate_params(self):
        """Raise ValueError when k, weighting or the training set is unusable."""
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(
                f"k должно быть положительным целым числом, получено {self.k!r}"
            )
        # Without this check any unknown value would silently be treated
        # as distance weighting.
        if self.weighting not in self._WEIGHTINGS:
            raise ValueError(f"Взвешивание {self.weighting} не поддерживается")
        if len(self.X_train) == 0:
            raise ValueError("Обучающая выборка пуста")

    def _distance(self, x1, x2):
        """
        Return the distance between x1 and x2 under ``self.metric``.

        The reduction runs over the last axis, so a single vector pair yields
        a scalar while a (n_samples, n_features) matrix broadcast against one
        vector yields one distance per row.

        Raises
        ------
        ValueError
            If ``self.metric`` is not a supported metric.
        """
        # atleast_1d keeps plain scalar inputs working with axis=-1.
        diff = np.atleast_1d(np.asarray(x1) - np.asarray(x2))
        if self.metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=-1))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(diff), axis=-1)
        else:
            raise ValueError(f"Метрика {self.metric} не поддерживается")

    def _predict_one(self, x):
        """Predict the label of a single sample x by voting among its k nearest neighbours."""
        self._validate_params()

        # Flatten each sample to one feature vector so that 1-D and
        # multi-dimensional samples are handled the same way.
        train = self.X_train.reshape(len(self.X_train), -1)
        dists = self._distance(train, np.ravel(x))

        # A stable sort keeps equal distances in training order, which fixes
        # the tie-breaking rule (earlier training samples win).
        nearest = np.argsort(dists, kind='stable')[:self.k]

        if self.weighting == 'uniform':
            labels = [self.y_train[i] for i in nearest]
            return Counter(labels).most_common(1)[0][0]

        # Inverse-distance voting: closer neighbours contribute more.
        weight_sum = {}
        for i in nearest:
            w = 1.0 / (dists[i] + self.eps)
            label = self.y_train[i]
            weight_sum[label] = weight_sum.get(label, 0) + w
        return max(weight_sum.items(), key=lambda kv: kv[1])[0]

    def fit(self, X, y):
        """Store the training samples X and their labels y; return self."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        return self

    def predict(self, X):
        """Return an array with one predicted label per sample in X."""
        X = np.array(X)
        return np.array([self._predict_one(x) for x in X])

# ====================== Full dataset analysis function ======================

def analyze_dataset(dataset_name, X, y, k_max=15, cv_folds=5):
    """
    Grid-search KNN hyper-parameters on one dataset with K-fold cross-validation.

    For every combination of metric ('euclidean', 'manhattan'), weighting
    ('uniform', 'distance') and k in 1..k_max, the mean validation accuracy
    over ``cv_folds`` shuffled folds (random_state=42) is computed. One
    accuracy-vs-k curve is plotted per (metric, weighting) pair and the best
    configuration is printed.

    Features are standardised inside each fold: the scaler is fitted on the
    training part only, so validation samples do not leak into the
    preprocessing statistics.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed header and the plot title.
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one per sample.
    k_max : int
        Largest number of neighbours to evaluate (inclusive).
    cv_folds : int
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best, df_results, fig)``: the best row as a dict with keys 'k',
        'metric', 'weighting', 'accuracy'; a DataFrame with one row per
        evaluated configuration; and the matplotlib figure (the caller is
        responsible for showing and closing it).

    Raises
    ------
    ValueError
        If ``k_max`` is smaller than 1.
    """
    print(f"\n========== Анализ датасета: {dataset_name} ==========")

    if k_max < 1:
        raise ValueError(f"k_max должно быть не меньше 1, получено {k_max}")

    # Index-array selection below requires ndarrays, not plain lists.
    X = np.asarray(X)
    y = np.asarray(y)

    # The splits depend only on the sample count and the fixed seed, so they
    # are identical for every configuration: build and scale them once.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    folds = []
    for train_idx, val_idx in kf.split(X):
        scaler = StandardScaler().fit(X[train_idx])
        folds.append((
            scaler.transform(X[train_idx]),
            y[train_idx],
            scaler.transform(X[val_idx]),
            y[val_idx],
        ))

    k_range = range(1, k_max + 1)
    metrics = ['euclidean', 'manhattan']
    weightings = ['uniform', 'distance']

    results = []

    # Line style and legend label per (metric, weighting) pair.
    styles = {
        ('euclidean', 'uniform'):   ('b-', 'Евклидова, равномерное'),
        ('euclidean', 'distance'):  ('g-', 'Евклидова, взвешенное'),
        ('manhattan', 'uniform'):   ('r-', 'Манхэттенская, равномерное'),
        ('manhattan', 'distance'):  ('m-', 'Манхэттенская, взвешенное')
    }

    fig, ax = plt.subplots(figsize=(10, 6))

    for metric in metrics:
        for weighting in weightings:
            cv_scores_by_k = []
            for k in k_range:
                scores = []
                for X_train, y_train, X_val, y_val in folds:
                    model = KNNClassifier(k=k, metric=metric, weighting=weighting)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_val)
                    scores.append(accuracy_score(y_val, y_pred))

                mean_score = np.mean(scores)
                cv_scores_by_k.append(mean_score)
                results.append({
                    'k': k,
                    'metric': metric,
                    'weighting': weighting,
                    'accuracy': mean_score
                })

            color_style, label = styles[(metric, weighting)]
            ax.plot(k_range, cv_scores_by_k, color_style, linewidth=2, label=label)

    df_results = pd.DataFrame(results)
    # idxmax returns the first maximum, so ties go to the configuration
    # evaluated earliest.
    best_idx = df_results['accuracy'].idxmax()
    best_row = df_results.loc[best_idx]
    print(f"\nЛучшие параметры (по кросс-валидации):")
    print(f"  k = {best_row['k']:.0f}")
    print(f"  метрика = {best_row['metric']}")
    print(f"  взвешивание = {best_row['weighting']}")
    print(f"  средняя точность = {best_row['accuracy']:.4f}")

    ax.set_xlabel('k (число соседей)', fontsize=12)
    ax.set_ylabel('Средняя точность на кросс-валидации', fontsize=12)
    ax.set_title(f'{dataset_name}: зависимость точности от k', fontsize=14)
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    return best_row.to_dict(), df_results, fig

# Load both bundled scikit-learn datasets up front.
iris, digits = load_iris(), load_digits()

# Iris: run the full hyper-parameter sweep, show the figure, then close it.
best_iris, df_iris, fig_iris = analyze_dataset(
    dataset_name="Iris",
    X=iris.data,
    y=iris.target,
    k_max=15,
)
display(fig_iris)
plt.close(fig_iris)

# Digits: same sweep on the handwritten-digits dataset.
best_digits, df_digits, fig_digits = analyze_dataset(
    dataset_name="Digits",
    X=digits.data,
    y=digits.target,
    k_max=15,
)
display(fig_digits)
plt.close(fig_digits)

print("\n========== Исследование завершено ==========")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions