import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline


# Raw toy dataset. It deliberately contains one missing value per column,
# one duplicated row (0.86, 2.49) and two extreme points at the end, so that
# the cleaning steps further down have something to act on.
x_values = [
    0.86, 0.09, -0.85, 0.87, -0.44, np.nan, -1.10, 0.40, -0.96, 0.17,
    0.86, 5.5, -3.2,
]
y_values = [
    2.49, 0.83, -0.25, 3.10, np.nan, 0.02, -0.12, 1.81, -0.83, 0.43,
    2.49, 20, -15,
]
data = pd.DataFrame({"x": x_values, "y": y_values})

# Quick look at the untouched data and its (rows, columns) shape.
print("=== Aperçu des données brutes ===")
print(data)
print("\nDimensions :", data.shape)

=== Aperçu des données brutes ===
       x      y
0   0.86   2.49
1   0.09   0.83
2  -0.85  -0.25
3   0.87   3.10
4  -0.44    NaN
5    NaN   0.02
6  -1.10  -0.12
7   0.40   1.81
8  -0.96  -0.83
9   0.17   0.43
10  0.86   2.49
11  5.50  20.00
12 -3.20 -15.00

Dimensions : (13, 2)


# Report how many entries are missing in each column.
print("=== Valeurs manquantes ===")
missing_per_column = data.isna().sum()
print(missing_per_column)

# Discard every row that has at least one missing value, then show the
# remaining rows and the new shape.
data = data.dropna(axis=0, how="any")
print("\n=== Données après suppression des NaN ===")
print(data)
print("\nDimensions :", data.shape)

=== Valeurs manquantes ===
x    1
y    1
dtype: int64

=== Données après suppression des NaN ===
       x      y
0   0.86   2.49
1   0.09   0.83
2  -0.85  -0.25
3   0.87   3.10
6  -1.10  -0.12
7   0.40   1.81
8  -0.96  -0.83
9   0.17   0.43
10  0.86   2.49
11  5.50  20.00
12 -3.20 -15.00

Dimensions : (11, 2)


# Show every copy of each duplicated row (keep=False flags all occurrences,
# not only the repeats).
print("=== Lignes dupliquées ===")
print(data[data.duplicated(keep=False)])

# Keep a single occurrence of each duplicated row.
data = data.drop_duplicates()
print("\n=== Données après suppression des doublons ===")
print(data)

# Box plot of both columns to make the extreme values visible.
plt.figure(figsize=(6, 4))
plt.boxplot([data["x"], data["y"]])
# The boxes are drawn at positions 1 and 2 by default. They are labelled
# through xticks instead of boxplot's `labels=` keyword, which is deprecated
# since Matplotlib 3.9 (renamed `tick_labels`) and scheduled for removal;
# xticks works on both older and newer Matplotlib versions.
plt.xticks([1, 2], ["x", "y"])
plt.title("Boxplot de x et y (détection des valeurs aberrantes)")
plt.show()

# Keep only the rows with x in [-2, 2] and y in [-5, 5] (bounds inclusive).
data = data[(data["x"].between(-2, 2)) & (data["y"].between(-5, 5))]
print("\n=== Données après filtrage des valeurs extrêmes ===")
print(data)
print("\nDimensions finales :", data.shape)

=== Lignes dupliquées ===
       x     y
0   0.86  2.49
10  0.86  2.49

=== Données après suppression des doublons ===
       x      y
0   0.86   2.49
1   0.09   0.83
2  -0.85  -0.25
3   0.87   3.10
6  -1.10  -0.12
7   0.40   1.81
8  -0.96  -0.83
9   0.17   0.43
11  5.50  20.00
12 -3.20 -15.00

C:\Users\PC\AppData\Local\Temp\ipykernel_11392\3151276952.py:9: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([data["x"], data["y"]], labels=["x", "y"])

=== Données après filtrage des valeurs extrêmes ===
      x     y
0  0.86  2.49
1  0.09  0.83
2 -0.85 -0.25
3  0.87  3.10
6 -1.10 -0.12
7  0.40  1.81
8 -0.96 -0.83
9  0.17  0.43

Dimensions finales : (8, 2)


# Turn both columns into (n, 1) column vectors.
X = data["x"].to_numpy()[:, np.newaxis]
y = data["y"].to_numpy()[:, np.newaxis]

# Design matrix: the feature in the first column and a constant 1 in the
# second one, so the second coefficient of the model acts as the intercept.
X_b = np.column_stack((X, np.ones(len(X))))

print("Shape de X_b :", X_b.shape)
print("Shape de y   :", y.shape)
print("\nX_b :\n", X_b)
print("\ny :\n", y)

Shape de X_b : (8, 2)
Shape de y   : (8, 1)

X_b :
 [[ 0.86  1.  ]
 [ 0.09  1.  ]
 [-0.85  1.  ]
 [ 0.87  1.  ]
 [-1.1   1.  ]
 [ 0.4   1.  ]
 [-0.96  1.  ]
 [ 0.17  1.  ]]

y :
 [[ 2.49]
 [ 0.83]
 [-0.25]
 [ 3.1 ]
 [-0.12]
 [ 1.81]
 [-0.83]
 [ 0.43]]


# Sequential 80 % / 20 % split: the leading rows are used for training and
# the trailing rows for testing (rows keep their current order, no shuffle).
train_size = int(0.8 * len(X_b))

X_train, X_test = X_b[:train_size], X_b[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print("Shape X_train :", X_train.shape)
print("Shape y_train :", y_train.shape)
print("Shape X_test  :", X_test.shape)
print("Shape y_test  :", y_test.shape)

Shape X_train : (6, 2)
Shape y_train : (6, 1)
Shape X_test  : (2, 2)
Shape y_test  : (2, 1)


# Normal equation of ordinary least squares: (X^T X) w = X^T y.
XT_X = X_train.T.dot(X_train)
XT_y = X_train.T.dot(y_train)

print("X^T X :\n", XT_X)
print("\nX^T y :\n", XT_y)

# Solve the linear system directly instead of forming the explicit inverse
# of X^T X: same solution, but numerically more accurate and cheaper.
# Like np.linalg.inv, this raises numpy.linalg.LinAlgError if X^T X is
# singular.
w = np.linalg.solve(XT_X, XT_y)
print("\nCoefficients w (pente, intercept) :\n", w)

X^T X :
 [[3.5971 0.27  ]
 [0.27   6.    ]]

X^T y :
 [[5.9816]
 [7.86  ]]

Coefficients w (pente, intercept) :
 [[1.56986848]
 [1.23935592]]


# Apply the fitted coefficients to the held-out rows.
y_pred_test = X_test @ w

# Print predictions and ground truth as flat 1-D arrays for easy comparison.
print("Prédictions (test) :", y_pred_test.ravel())
print("Valeurs réelles (test) :", y_test.ravel())

Prédictions (test) : [-0.26771782  1.50623356]
Valeurs réelles (test) : [-0.83  0.43]


# 100 evenly spaced abscissas spanning the observed range of x, extended with
# the constant column so the fitted coefficients can be applied to them.
x_line = np.linspace(X.min(), X.max(), 100)[:, np.newaxis]
x_line_b = np.column_stack((x_line, np.ones(len(x_line))))
y_line = x_line_b @ w

# Scatter of the observations with the fitted line drawn on top.
plt.figure(figsize=(6, 4))
plt.scatter(X, y, label="Données", alpha=0.7)
plt.plot(x_line, y_line, color="red", label="Modèle linéaire")
plt.xlabel("x")
plt.ylabel("y")
plt.title("Régression linéaire - Modèle entraîné")
plt.legend()
plt.show()


# Predictions on the training rows, needed for the training-set metrics.
y_pred_train = X_train @ w

# Mean squared error on both subsets.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

# Coefficient of determination on both subsets.
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_pred_test)

# One metric per line, four decimals each.
print(
    f"MSE train : {mse_train:.4f}",
    f"MSE test  : {mse_test:.4f}",
    f"R² train  : {r2_train:.4f}",
    f"R² test   : {r2_test:.4f}",
    sep="\n",
)

MSE train : 0.1201
MSE test  : 0.7372
R² train  : 0.9246
R² test   : -0.8574


# Persist the coefficient vector w in NumPy's binary .npy format so it can be
# reloaded later with np.load.
np.save(file="modele_lineaire.npy", arr=w)
print("Vecteur de paramètres w sauvegardé dans 'modele_lineaire.npy'")

Vecteur de paramètres w sauvegardé dans 'modele_lineaire.npy'

TP Régression Linéaire complète

Version : Pr M.El Alami (corrigé)

0. Imports et préparation

1. Collecte du dataset

2. Nettoyage des données (Data Cleaning)

2.1. Valeurs manquantes

2.2. Doublons et valeurs aberrantes

3. Feature Engineering

4. Séparation du dataset Training / Test

5. Entraîner le modèle (Train ML Algorithm)

6. Tester le modèle sur le jeu de test

7. Visualisation du modèle

8. Évaluer les performances du modèle

9. Boucle d'amélioration & sauvegarde