import matplotlib
matplotlib.use('Agg')  # non-interactive backend: render plots to files only
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bidi.algorithm import get_display  # reorders Hebrew text for correct RTL display
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the dataset and drop rows whose target (last column) is zero
df = pd.read_excel('data.xlsx')
df = df[df[df.columns[-1]] != 0]
X = df[df.columns[0:10]]  # predictors: first ten columns
y = df[df.columns[-1]]    # target: last column

# Define hyperparameter grids for GridSearchCV
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 8],
}
param_grid_svr = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf'],
}

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": GridSearchCV(RandomForestRegressor(), param_grid_rf, cv=5),
    "Gradient Boosting": GridSearchCV(GradientBoostingRegressor(), param_grid_gb, cv=5),
    "Support Vector Regression": GridSearchCV(SVR(), param_grid_svr, cv=5),
}

# Outer cross-validation; pass an integer random_state for reproducible splits
kf = KFold(n_splits=5, shuffle=True, random_state=None)
model_scores = {}      # name -> list of per-fold R^2 scores
model_mse_scores = {}  # name -> list of per-fold MSE scores

for name, model in models.items():
    r2_scores = []
    mse_scores = []
    for train_index, test_index in kf.split(X):
        X_train_kf, X_test_kf = X.iloc[train_index], X.iloc[test_index]
        y_train_kf, y_test_kf = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, to avoid leaking
        # test-fold statistics into training
        scaler = StandardScaler()
        X_train_kf = scaler.fit_transform(X_train_kf)
        X_test_kf = scaler.transform(X_test_kf)

        # For GridSearchCV objects, predict() already delegates to the
        # refit best_estimator_, so no special-casing is needed here
        model.fit(X_train_kf, y_train_kf)
        predictions = model.predict(X_test_kf)

        r2_scores.append(r2_score(y_test_kf, predictions))
        mse_scores.append(mean_squared_error(y_test_kf, predictions))
    model_scores[name] = r2_scores
    model_mse_scores[name] = mse_scores

# --- Feature-importance plot (tree-based models only) ---
plt.figure(figsize=(15, 10))
colors = ['tab:pink', 'tab:cyan', 'tab:blue', 'tab:purple']  # add more if needed
bar_width = 0.2
positions = np.arange(len(X.columns))

# Collect importances (as percentages) from the models that expose them;
# LinearRegression and SVR have no feature_importances_ attribute
importance_models = {
    name: model.best_estimator_.feature_importances_ * 100
    for name, model in models.items()
    if hasattr(model, 'best_estimator_')
    and hasattr(model.best_estimator_, 'feature_importances_')
}

# Use one shared feature ordering (by mean importance) so the grouped bars
# for different models line up against the same y-tick labels
indices = np.argsort(np.mean(list(importance_models.values()), axis=0))
corrected_labels = [get_display(X.columns[i]) for i in indices]

for idx, (name, importances) in enumerate(importance_models.items()):
    pos_shifted = positions + idx * bar_width  # shift bars for each model
    bars = plt.barh(pos_shifted, importances[indices], bar_width,
                    align='center', color=colors[idx], alpha=0.6, label=name)
    # Annotate the importance percentage next to each bar
    for bar, imp in zip(bars, importances[indices]):
        plt.text(bar.get_width() + 0.5,
                 bar.get_y() + bar.get_height() / 2,
                 '{:.2f}%'.format(imp),
                 ha='left', va='center', fontsize=14,
                 fontname="Times New Roman")

# Center the y-ticks on each group of bars
plt.yticks(positions + bar_width * (len(importance_models) - 1) / 2,
           corrected_labels, fontname="Times New Roman", fontsize=20)
plt.xticks(fontsize=20, fontname="Times New Roman")
plt.xlabel(get_display('חשיבות (%)'), fontsize=24,
           fontname="Times New Roman")  # Hebrew: "Importance (%)"
plt.ylabel(get_display('מאפיינים'), fontsize=24,
           fontname="Times New Roman")  # Hebrew: "Features"
# Hebrew title: "Normalized importance of each feature in building the model"
plt.title(get_display('חשיבות מנורמלת של כל מאפיין עבור בניית המודל'),
          fontsize=24, fontname="Times New Roman")
plt.legend(loc='lower right', fontsize=14)
plt.tight_layout()
plt.savefig('feature_importances_combined.png')
plt.close()

# --- Evaluation results: box plots of the per-fold scores ---
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 20))
plt.suptitle('Model performance across KFold splits', fontsize=24,
             fontname="Times New Roman")
medianprops = dict(color='black')

# Box plot for R^2 scores
r2_data = [model_scores[model_name] for model_name in models.keys()]
bp1 = axes[0].boxplot(r2_data, vert=True, patch_artist=True, widths=0.6,
                      medianprops=medianprops)
axes[0].set_ylabel('R^2', fontsize=20, fontname="Times New Roman")
axes[0].set_xticklabels(models.keys(), rotation=45, fontsize=15,
                        fontname="Times New Roman")
axes[0].grid(True, alpha=0.2)

# Box plot for MSE scores
mse_data = [model_mse_scores[model_name] for model_name in models.keys()]
bp2 = axes[1].boxplot(mse_data, vert=True, patch_artist=True, widths=0.6,
                      medianprops=medianprops)
axes[1].set_ylabel('Mean squared error (MSE)', fontsize=20,
                   fontname="Times New Roman")
axes[1].set_xticklabels(models.keys(), rotation=45, fontsize=15,
                        fontname="Times New Roman")
axes[1].grid(True, alpha=0.2)

for ax in axes:
    ax.tick_params(axis='y', labelsize=20, labelcolor='black',
                   direction='out', width=1, length=5, which='major')
    for tick in ax.get_yticklabels():
        tick.set_fontname("Times New Roman")

# Match the box colors to the feature-importance plot
for patch1, patch2, color in zip(bp1['boxes'], bp2['boxes'], colors):
    patch1.set_facecolor(color)
    patch2.set_facecolor(color)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the suptitle
plt.savefig('boxplots_scores.png')
plt.close()

# Print the raw per-fold scores
print("R^2 Scores:")
for model_name, scores in zip(models.keys(), r2_data):
    print(f"{model_name}: {', '.join(map(str, scores))}")
print("\nMSE Scores:")
for model_name, scores in zip(models.keys(), mse_data):
    print(f"{model_name}: {', '.join(map(str, scores))}")

# Summarize mean +/- std of the per-fold scores for each model
summary_texts = []
for model_name in models.keys():
    r2_mean = np.mean(model_scores[model_name])
    r2_std = np.std(model_scores[model_name])
    mse_mean = np.mean(model_mse_scores[model_name])
    mse_std = np.std(model_mse_scores[model_name])
    summary_texts.append(
        f"Model: {model_name}\n"
        f"-------------------------\n"
        f"Average R^2 Score: {r2_mean:.3f} ± {r2_std:.3f}\n"
        f"Average Mean Squared Error (MSE): {mse_mean:.3f} ± {mse_std:.3f}\n\n"
    )

# Combine all model summaries into a single report, then save and print it
report_text = "\n".join(summary_texts)
with open("model_performance_summary.txt", "w") as file:
    file.write(report_text)
print(report_text)