import matplotlib
matplotlib.use('Agg')  # non-interactive backend: render plots to files only
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bidi.algorithm import get_display  # reorders Hebrew text for correct RTL display
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the dataset and drop rows whose target (last column) is zero
df = pd.read_excel('data.xlsx')
df = df[df[df.columns[-1]] != 0]
X = df[df.columns[0:10]]  # predictors: first ten columns
y = df[df.columns[-1]]    # target: last column

# Define hyperparameter grids for GridSearchCV
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 8],
}
param_grid_svr = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf'],
}

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": GridSearchCV(RandomForestRegressor(), param_grid_rf, cv=5),
    "Gradient Boosting": GridSearchCV(GradientBoostingRegressor(), param_grid_gb, cv=5),
    "Support Vector Regression": GridSearchCV(SVR(), param_grid_svr, cv=5),
}

# Outer cross-validation; pass an integer random_state for reproducible splits
kf = KFold(n_splits=5, shuffle=True, random_state=None)
model_scores = {}      # name -> list of per-fold R^2 scores
model_mse_scores = {}  # name -> list of per-fold MSE scores

for name, model in models.items():
    r2_scores = []
    mse_scores = []
    for train_index, test_index in kf.split(X):
        X_train_kf, X_test_kf = X.iloc[train_index], X.iloc[test_index]
        y_train_kf, y_test_kf = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, to avoid leaking
        # test-fold statistics into training
        scaler = StandardScaler()
        X_train_kf = scaler.fit_transform(X_train_kf)
        X_test_kf = scaler.transform(X_test_kf)

        # For GridSearchCV objects, predict() already delegates to the
        # refit best_estimator_, so no special-casing is needed here
        model.fit(X_train_kf, y_train_kf)
        predictions = model.predict(X_test_kf)

        r2_scores.append(r2_score(y_test_kf, predictions))
        mse_scores.append(mean_squared_error(y_test_kf, predictions))
    model_scores[name] = r2_scores
    model_mse_scores[name] = mse_scores

# --- Feature-importance plot (tree-based models only) ---
plt.figure(figsize=(15, 10))
colors = ['tab:pink', 'tab:cyan', 'tab:blue', 'tab:purple']  # add more if needed
bar_width = 0.2
positions = np.arange(len(X.columns))

# Collect importances (as percentages) from the models that expose them;
# LinearRegression and SVR have no feature_importances_ attribute
importance_models = {
    name: model.best_estimator_.feature_importances_ * 100
    for name, model in models.items()
    if hasattr(model, 'best_estimator_')
    and hasattr(model.best_estimator_, 'feature_importances_')
}

# Use one shared feature ordering (by mean importance) so the grouped bars
# for different models line up against the same y-tick labels
indices = np.argsort(np.mean(list(importance_models.values()), axis=0))
corrected_labels = [get_display(X.columns[i]) for i in indices]

for idx, (name, importances) in enumerate(importance_models.items()):
    pos_shifted = positions + idx * bar_width  # shift bars for each model
    bars = plt.barh(pos_shifted, importances[indices], bar_width,
                    align='center', color=colors[idx], alpha=0.6, label=name)
    # Annotate the importance percentage next to each bar
    for bar, imp in zip(bars, importances[indices]):
        plt.text(bar.get_width() + 0.5,
                 bar.get_y() + bar.get_height() / 2,
                 '{:.2f}%'.format(imp),
                 ha='left', va='center', fontsize=14,
                 fontname="Times New Roman")

# Center the y-ticks on each group of bars
plt.yticks(positions + bar_width * (len(importance_models) - 1) / 2,
           corrected_labels, fontname="Times New Roman", fontsize=20)
plt.xticks(fontsize=20, fontname="Times New Roman")
plt.xlabel(get_display('חשיבות (%)'), fontsize=24,
           fontname="Times New Roman")  # Hebrew: "Importance (%)"
plt.ylabel(get_display('מאפיינים'), fontsize=24,
           fontname="Times New Roman")  # Hebrew: "Features"
# Hebrew title: "Normalized importance of each feature in building the model"
plt.title(get_display('חשיבות מנורמלת של כל מאפיין עבור בניית המודל'),
          fontsize=24, fontname="Times New Roman")
plt.legend(loc='lower right', fontsize=14)
plt.tight_layout()
plt.savefig('feature_importances_combined.png')
plt.close()

# --- Evaluation results: box plots of the per-fold scores ---
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 20))
plt.suptitle('Model performance across KFold splits', fontsize=24,
             fontname="Times New Roman")
medianprops = dict(color='black')

# Box plot for R^2 scores
r2_data = [model_scores[model_name] for model_name in models.keys()]
bp1 = axes[0].boxplot(r2_data, vert=True, patch_artist=True, widths=0.6,
                      medianprops=medianprops)
axes[0].set_ylabel('R^2', fontsize=20, fontname="Times New Roman")
axes[0].set_xticklabels(models.keys(), rotation=45, fontsize=15,
                        fontname="Times New Roman")
axes[0].grid(True, alpha=0.2)

# Box plot for MSE scores
mse_data = [model_mse_scores[model_name] for model_name in models.keys()]
bp2 = axes[1].boxplot(mse_data, vert=True, patch_artist=True, widths=0.6,
                      medianprops=medianprops)
axes[1].set_ylabel('Mean squared error (MSE)', fontsize=20,
                   fontname="Times New Roman")
axes[1].set_xticklabels(models.keys(), rotation=45, fontsize=15,
                        fontname="Times New Roman")
axes[1].grid(True, alpha=0.2)

for ax in axes:
    ax.tick_params(axis='y', labelsize=20, labelcolor='black',
                   direction='out', width=1, length=5, which='major')
    for tick in ax.get_yticklabels():
        tick.set_fontname("Times New Roman")

# Match the box colors to the feature-importance plot
for patch1, patch2, color in zip(bp1['boxes'], bp2['boxes'], colors):
    patch1.set_facecolor(color)
    patch2.set_facecolor(color)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the suptitle
plt.savefig('boxplots_scores.png')
plt.close()

# Print the raw per-fold scores
print("R^2 Scores:")
for model_name, scores in zip(models.keys(), r2_data):
    print(f"{model_name}: {', '.join(map(str, scores))}")
print("\nMSE Scores:")
for model_name, scores in zip(models.keys(), mse_data):
    print(f"{model_name}: {', '.join(map(str, scores))}")

# Summarize mean +/- std of the per-fold scores for each model
summary_texts = []
for model_name in models.keys():
    r2_mean = np.mean(model_scores[model_name])
    r2_std = np.std(model_scores[model_name])
    mse_mean = np.mean(model_mse_scores[model_name])
    mse_std = np.std(model_mse_scores[model_name])
    summary_texts.append(
        f"Model: {model_name}\n"
        f"-------------------------\n"
        f"Average R^2 Score: {r2_mean:.3f} ± {r2_std:.3f}\n"
        f"Average Mean Squared Error (MSE): {mse_mean:.3f} ± {mse_std:.3f}\n\n"
    )

# Combine all model summaries into a single report, then save and print it
report_text = "\n".join(summary_texts)
with open("model_performance_summary.txt", "w") as file:
    file.write(report_text)
print(report_text)