# ============================================================
# 1) IMPORTS — comprendre la syntaxe scikit-learn
# ============================================================

# NumPy : calcul numérique
import numpy as np

# matplotlib : visualisation
import matplotlib.pyplot as plt

# scikit-learn : outils Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# X is the feature matrix, y holds the binary labels (0 or 1).
X, y = data.data, data.target

# Show the shapes of both arrays as a quick sanity check.
print("Dimensions de X :", X.shape)
print("Dimensions de y :", y.shape)

Dimensions de X : (569, 30)
Dimensions de y : (569,)

array([0, 0, 0, 0, 0])


# Hold out 30% of the samples for testing and keep 70% for training.
# A fixed random_state makes the shuffle (and thus the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Report the shape of each partition.
print("Train :", X_train.shape)
print("Test  :", X_test.shape)

Train : (398, 30)
Test  : (171, 30)


# StandardScaler rescales every feature to zero mean and unit variance.
# The statistics (mean, standard deviation) are learned from the training
# set only, so nothing from the test set influences the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the transformation learned above to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Logistic regression classifier; max_iter is raised so the solver has
# enough iterations to converge.
model = LogisticRegression(max_iter=1000)

# Train the classifier on the standardized training data.
model.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=1000)


# Predict labels for the standardized test set.
y_pred = model.predict(X_test_scaled)

# Overall accuracy: fraction of test samples classified correctly.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy :", acc)

# Confusion matrix, printed under its title (sep="\n" puts the value on
# the line after the title, exactly like two consecutive print calls).
print("Matrice de confusion :", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision / recall / f1-score report, printed the same way.
print("Rapport de classification :", classification_report(y_test, y_pred), sep="\n")

Accuracy : 0.9824561403508771
Matrice de confusion :
[[ 62   1]
 [  2 106]]
Rapport de classification :
              precision    recall  f1-score   support

           0       0.97      0.98      0.98        63
           1       0.99      0.98      0.99       108

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

	penalty	'l2'
	dual	False
	tol	0.0001
	C	1.0
	fit_intercept	True
	intercept_scaling	1
	class_weight	None
	random_state	None
	solver	'lbfgs'
	max_iter	1000
	multi_class	'deprecated'
	verbose	0
	warm_start	False
	n_jobs	None
	l1_ratio	None

TP 2 — Régression Logistique avec scikit-learn

Notebook Jupyter — Python & scikit-learn très bien commentés

2) Chargement d'un dataset réel (Breast Cancer)

3) Séparation Train / Test

4) Normalisation des données (StandardScaler)

5) Modèle de Régression Logistique

6) Prédiction et évaluation

7) Interprétation