# projecte_Prediccio_Exit_Academic/prediccio_exit_academic.py

# -*- coding: utf-8 -*-
"""Projecte mida.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eZ29LldjXYTRp203aWEcHdlFj3qP0KAd

#**Predicció de l èxit o abandonament acadèmic dels estudiants**
"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets as ds
import sklearn.model_selection as cv
import sklearn.neighbors as nb
import seaborn as sns

import pandas
from pandas import plotting

# %matplotlib inline

#WEB DE LES  DADES
#https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

# Load the dataset straight from the UCI repository.
# The two notebook shell magics below are not valid Python syntax, so they are
# kept commented out; run them by hand in a shell or notebook cell if the
# packages are missing.
# !pip3 install -U ucimlrepo
# !pip3 install --upgrade certifi

from ucimlrepo import fetch_ucirepo

# fetch dataset
#predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

import ssl
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. TLS certificate verification is switched off for the
# default-context HTTPS requests made by this process. Kept as-is so the
# download keeps working; prefer fixing the certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the dataset (UCI id 697).
dades = fetch_ucirepo(id=697)
# data (as pandas dataframes)
X = dades.data.features
Y = dades.data.targets

# metadata
#print(predict_students_dropout_and_academic_success.metadata)

# variable information
print(dades.variables)
dades2 = dades.data

"""**Exemple primeres 20 files del dataset amb el seu Target**"""

# Preview the first 20 feature rows together with their target values.
print(X.head(20))
print(Y.head(20))

"""

---


# Inspecció de les dades"""

# Per-column overview of the feature table.
print("Informació de cada columna: ")
# DataFrame.info() prints its report itself and returns None, so `names` is None.
names = X.info()

print("Comprovació dels noms de les columnes: ")
print(X.columns)

print("-----------")
print("Estadistiques de les Columnes (Variables) (X):")
descript_X = X.describe(include="all")
print(descript_X)

print("Descripció dels valors de Columnes :")
# Distinct values found in every feature column.
valunicX = X.apply(lambda column: column.unique())
print(valunicX)

print("-----------")
print("Estadistiques de les Files (Target) (y):")
descript_Y = Y.describe(include="all")
print(descript_Y)

print("Descripció dels valors de Target :")
valunicY = Y['Target'].unique()
print(valunicY)
# Pay special attention to the unique values: they reveal possible data errors.

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Colour palette used by the charts below.
num_colors = 3
# `matplotlib.cm.get_cmap` is deprecated and no longer available in recent
# Matplotlib releases; `pyplot.get_cmap` takes the same (name, lut) arguments.
cmap = plt.get_cmap('plasma', num_colors)

# Sample the colormap: each entry is an RGBA tuple, not a colour name.
colors = [cmap(i) for i in range(num_colors)]

# Display the list of colors
print(colors)

# Gender distribution of the students (coding: 0 = female, 1 = male).
# value_counts() orders by frequency, so sort by the gender code to guarantee
# that bar 0 is "Dona" and bar 1 is "Home", matching the tick labels below
# regardless of which group is larger.
gender_counts = dades2.features['Gender'].value_counts().sort_index()
colors2 = [(0.798216, 0.280197, 0.469538, 1.0), (0.993814, 0.704741, 0.183043, 1.0)]

gender_counts.plot(kind='bar', color=colors2)

plt.xlabel('Genere')
plt.xticks([0, 1], ['Dona', 'Home'])  # position 0 -> code 0 (female), position 1 -> code 1 (male)
plt.ylabel('Numero destudiants')
plt.title('Distribució del genere dels estudiants')

plt.show()

# Histogram of the students' age at enrollment, restricted to 18-70 years.
num_bins = 20
age_range = (18, 70)

plt.figure(figsize=(10, 7))
plt.hist(
    dades2.features["Age at enrollment"],
    bins=num_bins,
    range=age_range,
    color=(0.798216, 0.280197, 0.469538, 1.0),
    edgecolor=(0.050383, 0.029803, 0.527975, 1.0),
)

plt.title("Edat de la inscripció")
plt.xlabel("Edat")
plt.ylabel("Frequencia")
# One tick every two years: 18, 20, ..., 68.
plt.xticks(np.arange(18, 70, 2))

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Native vs. international students.
# Native: International == 0 and Nacionality == 1; international: International == 1.
nat = len(dades2.features[(dades2.features["Nacionality"] == 1) & (dades2.features["International"] == 0)])
inter = len(dades2.features[dades2.features["International"] == 1])

# Pie chart with the "native" slice pulled out slightly.
labels = ['Natius', 'Internacionals']
sizes = [nat, inter]
explode = (0.1, 0)
colors2 = [
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
]

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors2,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')
plt.title('Distribució entre estudiants natius i internacionals ')
plt.show()

# Students per nationality: numeric code -> readable name.
nat_noms = {
    1: 'Portuguese',
    2: 'German',
    6: 'Spanish',
    11: 'Italian',
    13: 'Dutch',
    14: 'English',
    17: 'Lithuanian',
    21: 'Angolan',
    22: 'Cape Verdean',
    24: 'Guinean',
    25: 'Mozambican',
    26: 'Santomean',
    32: 'Turkish',
    41: 'Brazilian',
    62: 'Romanian',
    100: 'Moldova (Republic of)',
    101: 'Mexican',
    103: 'Ukrainian',
    105: 'Russian',
    108: 'Cuban',
    109: 'Colombian',
}
# Codes missing from the mapping become NaN and are dropped by value_counts().
nat_num = dades2.features['Nacionality'].map(nat_noms).value_counts()
colors = [
    (0.050383, 0.029803, 0.527975, 1.0),
    (0.362553, 0.003243, 0.649245, 1.0),
    (0.610667, 0.090204, 0.619951, 1.0),
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.928329, 0.472975, 0.326067, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
    (0.940015, 0.975158, 0.131326, 1.0),
]

# Pie chart of the nationalities.
labels = nat_num.index
sizes = nat_num.values
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
plt.title('Distribució destudiants per nacionalitat')
plt.show()

# Bar chart of the same counts.
plt.figure(figsize=(12, 6))
nat_num.plot(kind='bar', color=colors)
plt.xlabel('Nacionalitat')
plt.ylabel('Número destudiants')
plt.title('Numero destudiants per Nacionalitat')
plt.xticks(rotation=45, ha='right')
plt.show()

# Same bar chart without the Portuguese students, so that only the other
# nationalities are counted.
nat_num = (
    dades2.features['Nacionality']
    .map(nat_noms)
    .loc[lambda names: names != 'Portuguese']
    .value_counts()
)

plt.figure(figsize=(12, 6))
nat_num.plot(kind='bar', color=colors)
plt.xlabel('Nacionalitat')
plt.ylabel('Número destudiants')
plt.title('Numero destudiants per Nacionalitat (Excloint nacionalitat Portuguesa)')
plt.xticks(rotation=45, ha='right')

plt.show()

# Students per qualification held before university: code -> readable name.
prev_noms = {
    1: 'Secondary education',
    2: "Higher education - bachelor's degree",
    3: 'Higher education - degree',
    4: "Higher education - master's",
    5: 'Higher education - doctorate',
    6: 'Frequency of higher education',
    9: '12th year of schooling - not completed',
    10: '11th year of schooling - not completed',
    12: 'Other - 11th year of schooling',
    14: '10th year of schooling',
    15: '10th year of schooling - not completed',
    19: 'Basic education 3rd cycle (9th/10th/11th year) or equiv.',
    38: 'Basic education 2nd cycle (6th/7th/8th year) or equiv.',
    39: 'Technological specialization course',
    40: 'Higher education - degree (1st cycle)',
    42: 'Professional higher technical course',
    43: 'Higher education - master (2nd cycle)'
}
prev_num = dades2.features['Previous qualification'].map(prev_noms).value_counts()

colors = [
    (0.050383, 0.029803, 0.527975, 1.0),
    (0.362553, 0.003243, 0.649245, 1.0),
    (0.610667, 0.090204, 0.619951, 1.0),
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.928329, 0.472975, 0.326067, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
    (0.940015, 0.975158, 0.131326, 1.0),
]

plt.figure(figsize=(12, 6))
prev_num.plot(kind='bar', color=colors)
# Set the labels AFTER plotting: the pandas bar plot writes the index name
# into the x label when the index is named, which would overwrite a label
# set beforehand.
plt.xlabel("Qualificacions prèvies")
plt.ylabel("Número d'estudiants")
plt.title("Número d'estudiants per qualificacións prèvies a la universitat")
plt.xticks(rotation=45, ha='right')

plt.show()

# Students per application mode: code -> readable name.
applic_nom = {
    1: '1st phase - general contingent',
    2: 'Ordinance No. 612/93',
    5: '1st phase - special contingent (Azores Island)',
    7: 'Holders of other higher courses',
    10: 'Ordinance No. 854-B/99',
    15: 'International student (bachelor)',
    16: '1st phase - special contingent (Madeira Island)',
    17: '2nd phase - general contingent',
    18: '3rd phase - general contingent',
    26: 'Ordinance No. 533-A/99, item b2) (Different Plan)',
    27: 'Ordinance No. 533-A/99, item b3 (Other Institution)',
    39: 'Over 23 years old',
    42: 'Transfer',
    43: 'Change of course',
    44: 'Technological specialization diploma holders',
    51: 'Change of institution/course',
    53: 'Short cycle diploma holders',
    57: 'Change of institution/course (International)'
}
applic_num = dades2.features['Application mode'].map(applic_nom).value_counts()

colors = [
    (0.050383, 0.029803, 0.527975, 1.0),
    (0.362553, 0.003243, 0.649245, 1.0),
    (0.610667, 0.090204, 0.619951, 1.0),
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.928329, 0.472975, 0.326067, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
    (0.940015, 0.975158, 0.131326, 1.0),
]

plt.figure(figsize=(12, 6))
applic_num.plot(kind='bar', color=colors)
# Set the labels AFTER plotting: the pandas bar plot writes the index name
# into the x label when the index is named, which would overwrite a label
# set beforehand.
plt.xlabel("Tipus d'inscripció")
plt.ylabel("Número d'estudiants")
plt.title("Número d'estudiants per tipus d'inscripció")
plt.xticks(rotation=45, ha='right')

plt.show()

# Students per degree course: code -> readable name.
course_noms = {
    33: 'Biofuel Production Technologies',
    171: 'Animation and Multimedia Design',
    8014: 'Social Service (evening attendance)',
    9003: 'Agronomy',
    9070: 'Communication Design',
    9085: 'Veterinary Nursing',
    9119: 'Informatics Engineering',
    9130: 'Equinculture',
    9147: 'Management',
    9238: 'Social Service',
    9254: 'Tourism',
    9500: 'Nursing',
    9556: 'Oral Hygiene',
    9670: 'Advertising and Marketing Management',
    9773: 'Journalism and Communication',
    9853: 'Basic Education',
    9991: 'Management (evening attendance)'
}

course_counts = dades2.features['Course'].map(course_noms).value_counts()

colors = [
    (0.050383, 0.029803, 0.527975, 1.0),
    (0.362553, 0.003243, 0.649245, 1.0),
    (0.610667, 0.090204, 0.619951, 1.0),
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.928329, 0.472975, 0.326067, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
    (0.940015, 0.975158, 0.131326, 1.0)
]

plt.figure(figsize=(12, 6))
course_counts.plot(kind='bar', color=colors)
# Set the labels AFTER plotting: the pandas bar plot writes the index name
# into the x label when the index is named, which would overwrite a label
# set beforehand.
plt.xlabel("Graus universitaris")
plt.ylabel("Número d'estudiants")
plt.title("Número d'estudiants per graus universitaris")
plt.xticks(rotation=45, ha='right')

plt.show()

# Correlation matrix between all numeric attributes, drawn as a heatmap.
corr = dades2.features.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(35, 35))
sns.heatmap(corr, cmap='plasma', annot=True, square=True, ax=ax)

plt.title("Corelació Heatmap entre els atributs")
plt.show()

# One histogram per attribute as a quick visual overview.
sns.set_theme(style='ticks')
dades2.features.hist(
    bins=10,
    figsize=(40, 35),
    grid=True,
    legend=None,
    color='pink',
)

"""##Relació de les dades amb l'Objectiu"""

# Target distribution: absolute counts (left panel) and proportions (right panel).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
colors = [
    (0.050383, 0.029803, 0.527975, 1.0),
    (0.798216, 0.280197, 0.469538, 1.0),
    (0.993814, 0.704741, 0.183043, 1.0),
]

# Left: number of students per outcome.
dades2.targets["Target"].value_counts().plot(
    kind="bar",
    ax=axes[0],
    title="Abandonament i èxit acadèmic dels estudiants",
    ylabel="Número d'estudiants",
    xlabel="Objectiu",
    rot=0,
    color=colors,
)
# Right: share of students per outcome.
dades2.targets["Target"].value_counts(normalize=True).plot(
    kind="bar",
    ax=axes[1],
    title="Percentatge d'abandonament i èxit acadèmic dels estudiants",
    ylabel="Percentatge d'estudiants",
    xlabel="Objectiu",
    rot=0,
    color=colors,
)

# Write each bar's value inside the bar.
axes[0].bar_label(axes[0].containers[0], label_type="center", color='w')
axes[1].bar_label(axes[1].containers[0], fmt='%.2g', label_type="center", color='w')

plt.show()

# Outcome share per gender.
colors = [(0.050383, 0.029803, 0.527975, 1.0), (0.798216, 0.280197, 0.469538, 1.0), (0.993814, 0.704741, 0.183043, 1.0)]

# `pd` is only imported further down the file, so use the `pandas` module name
# that is already imported here. normalize="index" divides every row (outcome)
# by its row total.
perc1 = pandas.crosstab(dades2.targets["Target"], dades2.features["Gender"], normalize="index")
perc1 = perc1.reindex(["Graduate", "Dropout", "Enrolled"])
ax = perc1.transpose().plot.bar(
    figsize=(10, 8),
    title="Grafica d'abandonament i èxit acadèmic per gènere dels estudiants",
    xlabel="Gènere",
    ylabel="Percentatge d'estudiants",
    rot=0,
    color=colors,
    fontsize=12
)
# Crosstab columns are sorted by gender code, so position 0 is female.
ax.set_xticklabels(("Female", "Male"))

# Print each share inside its bar.
for p in ax.containers:
    ax.bar_label(p, fmt='%.2f', label_type='center', fontsize=14, color='w')

# Show the figure here, like the other charts, instead of leaving it pending
# until the next plt.show() call.
plt.show()

# Outcome share per marital status.

# `pd` is only imported further down the file, so use the `pandas` module name
# that is already imported here.
marital_status_perc = pandas.crosstab(dades2.features["Marital Status"], dades2.targets["Target"], normalize='index')
marital_status_perc = marital_status_perc.reindex(columns=["Graduate", "Dropout", "Enrolled"])
colors = [(0.050383, 0.029803, 0.527975, 1.0),
          (0.798216, 0.280197, 0.469538, 1.0),
          (0.993814, 0.704741, 0.183043, 1.0)]

ax = marital_status_perc.plot(kind="bar", figsize=(10, 6), title="Grafica d'abandonament i èxit acadèmic per estat civil dels estudiants", color=colors)

plt.xlabel("Estat civil")
# NOTE(review): assumes all six marital-status codes are present, in this order — confirm against the data.
plt.xticks([0, 1, 2, 3, 4, 5], ['Single', 'Married', 'Widower', 'Divorced', 'Facto Union', 'Legally Separated'], rotation=0)

# Legend
ax.legend(["Graduate", "Dropout", "Enrolled"], loc=9)

# Annotate every bar with its percentage, centred just above the bar.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height:.2%}', (x + width/2, y + height), ha='center', fontsize=12)

plt.show()

# Outcome share per attendance schedule (daytime / evening).
colors = [(0.050383, 0.029803, 0.527975, 1.0), (0.798216, 0.280197, 0.469538, 1.0), (0.993814, 0.704741, 0.183043, 1.0)]

# `pd` is only imported further down the file, so use the `pandas` module name
# that is already imported here. normalize="index" divides every row (outcome)
# by its row total.
perc1 = pandas.crosstab(dades2.targets["Target"], dades2.features["Daytime/evening attendance"], normalize="index")
perc1 = perc1.reindex(["Graduate", "Dropout", "Enrolled"])
ax = perc1.transpose().plot.bar(
    figsize=(10, 8),
    title="Grafica d'abandonament i èxit acadèmic per horari dels estudiants",
    xlabel="Horari",
    ylabel="Percentatge d'estudiants",
    rot=0,
    color=colors,
    fontsize=12
)
# NOTE(review): labels assume code 0 = evening and 1 = daytime — confirm against the dataset documentation.
ax.set_xticklabels(("Evening (Nit)", "Daytime (Dia)"))

# Print each share inside its bar.
for p in ax.containers:
    ax.bar_label(p, fmt='%.2f', label_type='center', fontsize=14, color='w')

# Show the figure here, like the other charts, instead of leaving it pending
# until the next plt.show() call.
plt.show()

"""---
#   Preprocessament de les dades

## Preprocessament
"""

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

# Reload the data from Drive in case earlier steps modified it.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')
dades_p = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

dades_p.head()

# Missing values per attribute column. `dades_p.columns.isnull()` would only
# test the column *labels*, so the cell values are checked instead.
atr_buit = dades_p.drop(columns="Target").isnull().sum()

# Missing values in the target column.
obj_buit = dades_p["Target"].isnull().sum()

print("Valors buits dels atributs: ")
print(atr_buit)

print("\nValors buits del objectiu: ")
print(obj_buit)

# Duplicated rows: keep=False flags every copy of a repeated row.
dup_fil = dades_p[dades_p.duplicated(keep=False)]
# Count the rows; summing the frame would add up the cell values instead.
dup_num = len(dup_fil)

print("Numero de files duplicades:")
print(dup_num)

# Show the duplicated rows themselves (the frame is already filtered above;
# indexing dades_p with it again is not a valid row selection).
files = dup_fil
print("Files duplicades :")
print(files)

print("Si la llista surt buida significa que no hi ha cap fila duplicada")

"""S'han eliminat correctament els atributs triats i passa a ser un dataset de 37 columnes a 31 columnes amb tots els atributs numerals (no categorics) i amb només els estudiants o que ja s'han graduat o han abandonat

## Divisió de les dades
"""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Assuming using SVM for illustration
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


# Features and target for the split-ratio experiment.
X = dades_p.drop('Target', axis=1)
y = dades_p["Target"]

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# (train, validation) fractions; what is left of each pair is the test fraction.
split_ratios = [(0.6, 0.2), (0.7, 0.15), (0.8, 0.1)]
results = []

for train_ratio, val_ratio in split_ratios:
    test_ratio = 1 - train_ratio - val_ratio
    # First cut: training set vs. all held-out data.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=1-train_ratio, random_state=42)
    # Second cut: `test_size` sizes the second returned partition (X_test), so
    # it has to be the test share of the held-out data, not the validation share.
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_ratio/(val_ratio + test_ratio), random_state=42)

    # Seeded so that the accuracies are reproducible between runs.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    val_accuracy = accuracy_score(y_val, model.predict(X_val))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    # Store the results for this ratio.
    results.append({'train_ratio': train_ratio, 'val_ratio': val_ratio, 'val_accuracy': val_accuracy, 'test_accuracy': test_accuracy})

# Plot validation and test accuracy against the training ratio.
train_ratios = [r['train_ratio'] for r in results]
val_accuracies = [r['val_accuracy'] for r in results]
test_accuracies = [r['test_accuracy'] for r in results]

plt.figure(figsize=(10, 6))
plt.plot(train_ratios, val_accuracies, label=' Accuracy de la Validació', color='darkblue')
plt.plot(train_ratios, test_accuracies, label='Accuracy del Test', color='deeppink')
plt.xlabel('Training Set Ratio')
plt.ylabel('Accuracy')
plt.title('Model rendiment entre Train/Validation Ratios')
plt.legend()
plt.show()

"""## Atributs relacionats amb l'objectiu
Revisar cada atribut la seva relació amb l'objectiu, per aixi aconseguir una millor comprensió de les relacions dins del dataset.
Treure els estudiants que estan estudiant actualment ja que només volem els que ja s'han graduat o ho han abandonat.

"""

# Reload the data from Drive in case earlier steps modified it.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')
dades_p = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

dades_p.head()

dades_p.columns

print("Verificar que tots els atributs i targets siguin de tipus numeral")
dades_p.info()
print("Si el Dtype(tipus de variable) és int64 o float64 significa que és numeral. ")

# The target is categorical in the raw file.
# Class counts before removing the students who are still enrolled.
print("Abans treure estudiants que actualment esta estudien: ")
print(dades_p['Target'].value_counts())
print("Estudiants totals", dades_p['Target'].value_counts().sum(), '\n')

# Keep only the students who graduated or dropped out.
print("Despres de treure els estudiants que actualment estudien: ")
dades_p = dades_p.loc[dades_p['Target'].isin(['Graduate', 'Dropout'])].copy()
print(dades_p['Target'].value_counts())
print("Estudiants totals", dades_p['Target'].value_counts().sum(), '\n')

# Turn the categorical target into integer labels.
print("----------")
print("Canviar el target d'un atribut categoric a numeral: ")
dades_p['Target'] = LabelEncoder().fit_transform(dades_p['Target'])
print(dades_p['Target'].value_counts())
print("Estudiants totals", dades_p['Target'].value_counts().sum(), '\n')

print("--> 1 son els estudiants que s'han graduat i 0 els que no ho han fet")

# Split the attributes into thematic groups (each one also carries the target)
# to study their relationship with the target group by group.

# Demographic data
demo = dades_p.loc[:, [
    "Marital status",
    "Nacionality",
    "Displaced",
    "Gender",
    "Educational special needs",
    "Age at enrollment",
    "International",
    "Target",
]]

# Socio-economic data
soec = dades_p.loc[:, [
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Debtor",
    "Tuition fees up to date",
    "Scholarship holder",
    "Target",
]]

# Academic data
acad = dades_p.loc[:, [
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance\t',  # this column name ends with a tab character
    'Previous qualification',
    'Previous qualification (grade)',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Target',
]]

# Enrollment-process data
insc = dades_p.loc[:, ['Unemployment rate', 'Inflation rate', 'Admission grade', 'GDP', 'Target']]

# Check that every column ended up in one of the groups.
atributs_falten = set(dades_p.columns).difference(soec.columns, demo.columns, acad.columns, insc.columns)
print("Els atributs que falta redistribuir: ", atributs_falten)

# Correlation heatmap of the demographic group (target included).
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(demo.corr(), annot=True, cmap="plasma", ax=ax)
plt.title("Dades Demografiques")
plt.show()

"""Es pot observar que els atributs que tenen més correlació són: **International/Nacionality**

  La majoria d'estudiants son *Portuguesos* i ens podria portar problemes d'influència en els models de classificació

Els altres atributs tenen una correlació correcta amb el Target.
"""

# Correlation heatmap of the socio-economic group (target included).
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(soec.corr(), annot=True, cmap="plasma", ax=ax)
plt.title("Dades SocioEconomiques")
plt.show()

"""Es pot observar que els atributs que tenen més correlació són: **Father's occupation/Mother's occupation**  pero és un atribut interessant per veure la seva influència en els models.

Els altres atributs tenen una correlació correcta amb el Target.
"""

# Correlation heatmap of the academic group (target included).
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(acad.corr(), annot=True, cmap="plasma", ax=ax)
plt.title("Dades Academiques")
plt.show()

"""Es pot observar que els atributs que tenen més correlació són:

**Curricular units 1st/2nd sem (credited)**

**Curricular units 1st/2nd sem (enrolled)**

**Curricular units 1st/2nd sem (evaluations)**

**Curricular units 1st/2nd sem (approved)**

**Curricular units 1st/2nd sem (grade)**

 Prou elevats ( 0.80 en amunt) per afectar en el dataset.

Els altres atributs tenen una correlació correcta amb el Target.
"""

# Correlation heatmap of the enrollment-process group (target included).
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(insc.corr(), annot=True, cmap="plasma", ax=ax)
plt.title("Dades estat de la Inscripció")
plt.show()

"""Els atributs tenen una correlació correcta amb el target

### Prova eliminacó d'atributs amb més correlació
"""

# Attributes picked for removal from the dataset because of their high correlation.
atributs_treure = dades_p.loc[:, [
    'Nacionality',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
]]
# Work on a new frame so that dades_p keeps every column.
dades_noat = dades_p.drop(columns=atributs_treure.columns)

dades_noat.head()

# Sanity check of the preprocessing: compare the shapes of the three stages.
dades_og = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')
print("Dades originals:", dades_og.shape)
print("Dades sent preprocessades: (nomes graduats/no graduats) ", dades_p.shape)
print("Dades completament preprocessades (sense atributs amb alta correlació):", dades_noat.shape)

# Heatmap of the preprocessed dataset (graduates / dropouts only).
corr = dades_p.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(35, 35))
sns.heatmap(corr, cmap='plasma', annot=True, square=True, ax=ax)
plt.title("Corelació Heatmap entre els atributs estudiants graduats/no graduats")
plt.show()

# Same heatmap once the selected attributes have been dropped.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    dades_noat.corr(),
    annot=True,
    cmap="plasma",
    fmt='.2f',
    annot_kws={"size": 10},
    ax=ax,
)
plt.title("Corelació Heatmap entre els atributs estudiants graduats/no graduats (Sense atributs d'alta correlació)")
plt.show()

"""

---


# Data mining models
"""

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

# Reload the data from Drive in case earlier steps modified it.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

data.head()

# When not running in a Colab notebook, keep the dataset in the same folder
# as the program and load it like this:

#df=pd.read_csv('data.csv',sep=';')
#df.head()

# Class counts before removing the students who are still enrolled.
dm_data = data.copy()
print("Abans treure estudiants que actualment esta estudien: ")
print(dm_data.loc[:, 'Target'].value_counts())

# Keep only graduates and dropouts. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
print("Despres de treure els estudiants que actualment estudien: ")
dm_data = dm_data[(dm_data['Target'] == 'Graduate') | (dm_data['Target'] == 'Dropout')].copy()
print(dm_data.loc[:, 'Target'].value_counts())

# Turn the categorical target into integer labels.
print("----------")
print("Canviar el target d'un atribut categoric a numeral: ")
dm_data['Target'] = LabelEncoder().fit_transform(dm_data['Target'])
print(dm_data.loc[:, 'Target'].value_counts())

print("--> 1 son els estudiants que s'han graduat i 0 els que no ho han fet")

"""## Single Fold Cross Validation"""

# Re-read the data to avoid side effects from earlier steps.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')
data_sf = data.copy()

# Keep only graduates and dropouts. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
data_sf = data_sf[(data_sf['Target'] == 'Graduate') | (data_sf['Target'] == 'Dropout')].copy()

# Turn the categorical target into integer labels.
data_sf['Target'] = LabelEncoder().fit_transform(data_sf['Target'])
data_sf.shape

# Tune max_depth for the random forest, then evaluate on a single hold-out split.
from sklearn.model_selection import GridSearchCV
scaler = StandardScaler()

# X holds every attribute except the target; y holds the target.
X = data_sf.drop('Target', axis=1)
y = data_sf['Target']

# Split BEFORE scaling and fit the scaler on the training rows only, so that
# no statistics of the test rows leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

param_grid = {'max_depth': [5, 10, 15, 20, None]}


# Grid search (5-fold CV on the training split) over the tree depth.
grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100, random_state=20), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('Millor  max_depth trobada:', grid_search.best_params_['max_depth'])

# Final random forest classifier.
# NOTE(review): the depth was tuned with 100 trees and default split/leaf
# sizes, while this model uses 50 trees with min_samples_split=4 and
# min_samples_leaf=2 — confirm that this mismatch is intended.
rf = RandomForestClassifier(n_estimators=50, max_depth=grid_search.best_params_['max_depth'], min_samples_split=4, min_samples_leaf=2, random_state=20)
rf.fit(X_train, y_train)

# Predictions on the hold-out split.
y_pred2 = rf.predict(X_test)

print('\n INFORME MODEL SINGLE FOLD CV \n')
print(classification_report(y_test, y_pred2))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred2)

print("\nMATRIU DE CONFUSIÓ ")
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

print("\n RESULTATS ACCURACY")
print("Accuracy: ", accuracy_score(y_test, y_pred2))
print('Training Accuracy (entrenat):', rf.score(X_train, y_train))
print('Testing Accuracy (test):', rf.score(X_test, y_test))
"""## K-Fold Cross Validation"""

# Re-read the data to avoid side effects from earlier steps.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')
data2 = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')
data_kf = data2.copy()

# Keep only graduates and dropouts. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
data_kf = data_kf[(data_kf['Target'] == 'Graduate') | (data_kf['Target'] == 'Dropout')].copy()

# Turn the categorical target into integer labels.
data_kf['Target'] = LabelEncoder().fit_transform(data_kf['Target'])
data_kf.shape

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# X holds every attribute except the target; y holds the target.
X = data_kf.drop('Target', axis=1)
y = data_kf['Target']

# Split BEFORE scaling and fit the scaler on the training rows only, so that
# no statistics of the test rows leak into the training data.
scaler = StandardScaler()
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full = scaler.fit_transform(X_train_full)
X_test = scaler.transform(X_test)

from sklearn.model_selection import GridSearchCV

# Find the best n_neighbors for the k-fold cross-validated model.
param_grid = {'n_neighbors': range(1, 30)}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train_full, y_train_full)

print("Millor número n_neighors:", grid_search.best_params_['n_neighbors'])

# Stratified, shuffled 5-fold splitter.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# K-Fold Cross-Validation
for train_index, test_index in kf.split(X_train_full, y_train_full):
    # The scaled features are a NumPy array (no .iloc), so index it
    # positionally; the target is still a pandas Series.
    X_train, X_val = X_train_full[train_index], X_train_full[test_index]
    y_train, y_val = y_train_full.iloc[train_index], y_train_full.iloc[test_index]

    # Model with the best n_neighbors found by the grid search.
    model = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)  # prediction for this fold

    # Calculate metrics
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred, average='weighted'))
    recalls.append(recall_score(y_val, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_val, y_pred, average='weighted'))

# Final model: refit on the whole training split instead of reusing the model
# left over from the last fold, which only saw 4/5 of the training data.
model = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])
model.fit(X_train_full, y_train_full)
final_pred = model.predict(X_test)

print('\n INFORME MODEL K FOLD CV \n')
print(classification_report(y_test, final_pred))

# Confusion matrix
cm = confusion_matrix(y_test, final_pred)
print("\nMATRIU DE CONFUSIÓ ")
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

# Averages over the k cross-validation folds.
print("\n MITJANES DE CADA FOLD ")
print(f"Average Accuracy: {sum(accuracies) / len(accuracies)}")
print(f"Average Precision: {sum(precisions) / len(precisions)}")
print(f"Average Recall: {sum(recalls) / len(recalls)}")
print(f"Average F1 Score: {sum(f1_scores) / len(f1_scores)}")

"""---


# Machine learning methods

## Naı̈ve Bayes
"""

# Re-read the data to avoid side effects from earlier steps.
from google.colab import drive

# Mounting asks for permission to read the Drive.
drive.mount('/content/drive')

data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')
data_nb = data.copy()

# Keep only graduates and dropouts. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
data_nb = data_nb[(data_nb['Target'] == 'Graduate') | (data_nb['Target'] == 'Dropout')].copy()

# Turn the categorical target into integer labels.
data_nb['Target'] = LabelEncoder().fit_transform(data_nb['Target'])
data_nb.shape

from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as  plt
import pandas as pd
import numpy as np
# Class balance of the encoded target.
print(data_nb['Target'].value_counts())

scaler = StandardScaler()
# X holds every attribute except the target; y holds the target.
X = data_nb.drop(columns='Target')
y = data_nb['Target']
print(X.shape)
print(y.shape)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# trobar millors parametres- best_var_smoothing
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Turn predicted probabilities into hard 0/1 labels using a cut-off.
def filterp(threshold, probabilities):
    """Binarise ``probabilities`` against ``threshold``.

    Entries strictly greater than ``threshold`` become 1 and all others 0.
    ``probabilities`` must support element-wise ``>`` and ``.astype``
    (for example a NumPy array).
    """
    above_cutoff = probabilities > threshold
    return above_cutoff.astype(int)

# Candidate values for GaussianNB's var_smoothing (100 points, log-spaced
# from 1 down to 1e-9).
param_grid_nb = {'var_smoothing': np.logspace(0, -9, num=100)}

# Stratified 10-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Gaussian Naive Bayes model to tune.
gnb = GaussianNB()

grid_search_nb = GridSearchCV(estimator=gnb, param_grid=param_grid_nb, cv=cv, scoring='accuracy')
# Tune on the training split only. Fitting the search on the full X, y (as the
# original did) lets the held-out test rows influence the chosen
# hyper-parameter and makes the later test-set scores optimistic.
grid_search_nb.fit(X_train, y_train)

best_var_smoothing = grid_search_nb.best_params_['var_smoothing']
print(f"Millor var_smoothing: {grid_search_nb.best_params_} Accuracy: {grid_search_nb.best_score_}")

# Gaussian Naive Bayes refitted on the training split with the selected
# var_smoothing.
gnb = GaussianNB(var_smoothing=grid_search_nb.best_params_['var_smoothing'])
gnb.fit(X_train, y_train)

# Class-1 probabilities for the test set. They are only thresholded once the
# cut-off has been chosen below.
probs = gnb.predict_proba(X_test)[:, 1]

# The decision threshold is tuned on out-of-fold probabilities of the training
# split. The original tuned it against y_test and then reported on the same
# y_test, which selects the threshold with knowledge of the evaluation labels.
from sklearn.model_selection import cross_val_predict

train_probs = cross_val_predict(
    GaussianNB(var_smoothing=grid_search_nb.best_params_['var_smoothing']),
    X_train,
    y_train,
    cv=cv,
    method='predict_proba',
)[:, 1]

# Candidate thresholds: 0.00, 0.01, ..., 1.00.
thresholds = np.linspace(0, 1, 101)

best_threshold = 0
best_f1_score = 0

# Keep the first threshold that reaches the highest F1 for class 1.
for threshold in thresholds:
    preds = filterp(threshold, train_probs)
    # zero_division=0 scores an all-negative prediction as 0 without a warning.
    f1 = f1_score(y_train, preds, pos_label=1, zero_division=0)
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print(f"Best threshold: {best_threshold} with F1-score: {best_f1_score}")

# Apply the selected threshold to the class-1 probabilities and report.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns

final_preds = filterp(best_threshold, probs)

nb_report = classification_report(y_test, final_preds)
print('\n INFORME MODEL NAIVE BAYES  \n')
print(nb_report)

# Confusion matrix of the thresholded predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ ")
cm = confusion_matrix(y_test, final_preds)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

# The first figure uses the thresholded predictions; the other two come from
# gnb.score, i.e. the model's own predict() rather than the custom threshold.
thresholded_accuracy = accuracy_score(y_test, final_preds)
train_accuracy = gnb.score(X_train, y_train)
test_accuracy = gnb.score(X_test, y_test)

print("\n RESULTATS ACCURACY")
print("Accuracy: ", thresholded_accuracy)
print('Training Accuracy (entrenat):', train_accuracy)
print('Testing Accuracy (test):', test_accuracy)

"""## k-NN"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets as ds
import sklearn.model_selection as cv
import sklearn.neighbors as nb
from sklearn.preprocessing import StandardScaler,LabelEncoder

# %matplotlib inline

# Re-read the raw data so the k-NN section starts from a clean state.
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

# Keep only students who graduated or dropped out. The explicit copy makes the
# filtered frame independent of `data`, so the column assignment below is not
# a write into a slice (avoids pandas' SettingWithCopyWarning).
data_knn = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Encode the categorical target as integers.
data_knn['Target'] = LabelEncoder().fit_transform(data_knn['Target'])
data_knn.shape  # notebook display only; has no effect when run as a script

# X holds every column except the target; y holds the target.
X=data_knn.drop(["Target"],axis=1)
y=data_knn["Target"]

print(X.shape)
print(y.shape)

# One-hot encode, split into train/test, then scale.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X=pd.get_dummies(X)

# Split BEFORE scaling: the original fitted the scaler on every row, so the
# test rows contributed to the mean/variance used to transform the training
# data. The row assignment is unchanged (same sizes and random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from training rows only
X_test = scaler.transform(X_test)

# Kept so the module-level name `X_scaled` still exists; it is the full
# feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Look for a good number of neighbours with 10-fold cross-validation.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

accuracies = []
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_scores = cross_val_score(knn, X=X_train, y=y_train, cv=10)
    # Mean accuracy over the 10 folds for this k.
    mean_fold_accuracy = np.mean(cv_scores)
    print(f"Accuracy {k} neighbours: {mean_fold_accuracy}")
    accuracies.append(mean_fold_accuracy)

# Plot mean 10-fold CV accuracy against k, with and without distance
# weighting. 'uniform' is KNeighborsClassifier's default weighting, so the
# first curve matches the original call that omitted `weights`.
neighbour_grid = range(1, 30, 2)
for weighting, curve_label, curve_colour in (
    ('uniform', 'No weighting', 'darkblue'),
    ('distance', 'Weighting', 'deeppink'),
):
    lr = []
    for ki in neighbour_grid:
        cv_scores = cross_val_score(
            nb.KNeighborsClassifier(n_neighbors=ki, weights=weighting),
            X=X_train, y=y_train, cv=10,
        )
        lr.append(np.mean(cv_scores))
    # Only the `color` keyword is passed. The original also gave a colour in
    # the fmt string ('b' / 'r'), which matplotlib overrides with the keyword
    # while warning about the redundant definition.
    plt.plot(neighbour_grid, lr, label=curve_label, color=curve_colour)

plt.xlabel('valor k')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
plt.grid()
plt.tight_layout()

plt.show()

# Grid search over the number of neighbours and the weighting scheme
# (distance-weighted vs uniform), scored with 10-fold cross-validation.
from sklearn.model_selection import GridSearchCV

params = dict(
    n_neighbors=list(range(1, 30, 2)),
    weights=('distance', 'uniform'),
)
knc = nb.KNeighborsClassifier()
clf = GridSearchCV(knc, param_grid=params, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)

print("Millors Parametres:", clf.best_params_, "Accuracy=", clf.best_score_)

# Final k-NN classifier built from the best grid-search parameters.
parval = clf.best_params_
knc = nb.KNeighborsClassifier(
    n_neighbors=parval['n_neighbors'],
    weights=parval['weights'],
)
knc.fit(X_train, y_train)

# Predict the held-out test set.
pred = knc.predict(X_test)

knn_report = classification_report(y_test, pred)
print('\n INFORME MODEL KNN \n')
print(knn_report)

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ ")
cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

knn_test_accuracy = accuracy_score(y_test, pred)
print("\n RESULTATS ACCURACY")
print("Accuracy ", knn_test_accuracy)

"""## Decision Trees


"""

# Commented out IPython magic to ensure Python compatibility.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# %matplotlib inline
import sklearn.model_selection as cv
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Re-read the raw data so the decision-tree section starts from a clean state.
from google.colab import drive

# Mounting asks the user for permission to read their Google Drive.
drive.mount('/content/drive')
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

# Keep only students who graduated or dropped out. The explicit copy makes the
# filtered frame independent of `data`, so the column assignment below is not
# a write into a slice (avoids pandas' SettingWithCopyWarning).
data_dt = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Encode the categorical target as integers.
data_dt['Target'] = LabelEncoder().fit_transform(data_dt['Target'])

# X holds every column except the target; y holds the target.
X=data_dt.drop(["Target"],axis=1)
y=data_dt["Target"]

print(X.shape)
print(y.shape)

# One-hot encode, shuffle, and split into train/test.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

Xn=pd.get_dummies(X)
Xn.head()  # notebook display only; has no effect when run as a script

# Seed the shuffle: without a random_state the rows landed in a different
# train/test partition on every run, so the reported tree, accuracy and
# confidence interval were not reproducible even though the split is seeded.
Xn, y = shuffle(Xn, y, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(Xn, y, test_size=0.2, random_state=10)

# Find the best hyper-parameters for the decision tree.
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Parameters to optimise: split criterion, maximum depth, minimum impurity
# decrease, and the minimum samples needed to split a node / form a leaf.
param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [None,2, 5, 10, 20, 30],
    'min_impurity_decrease': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    'min_samples_split': [2, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 2, 4],
}
# Base model.
clf = tree.DecisionTreeClassifier()

# Exhaustive grid search with 5-fold cross-validation. The original line
# ended in a stray '¡' character, which is a SyntaxError that prevented the
# whole file from being parsed.
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

# Best parameter combination and its mean cross-validated score.
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Millors Parameters:", best_params, " Accuracy:", best_accuracy)

# Decision tree: fit the best estimator from the grid search and draw it.
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

best_tree = grid_search.best_estimator_
best_tree.fit(X_train, y_train)

# Column names of the one-hot encoded frame label the split nodes.
tree_feature_names = list(Xn.columns.values)

# Very large canvas so the nodes of a deep tree stay legible.
plt.figure(figsize=(200, 100))
tree.plot_tree(
    best_tree,
    filled=True,
    rounded=True,
    feature_names=tree_feature_names,
)
plt.title("Arbre de Decisió ")
plt.show()

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statsmodels.stats.proportion import proportion_confint
import seaborn as sns

# Predict the held-out test set with the fitted tree.
pred = best_tree.predict(X_test)

tree_report = classification_report(y_test, pred)
print('\n INFORME MODEL DECISION TREE \n')
print(tree_report)

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ ")
cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

# `epsilon` holds the test accuracy (the original name is kept); it is
# converted back to a count of correct predictions for the 95% interval.
epsilon = sklearn.metrics.accuracy_score(y_test, pred)
n_test_rows = X_test.shape[0]
accuracy_interval = proportion_confint(
    count=epsilon*n_test_rows,
    nobs=n_test_rows,
    alpha=0.05,
    method='binom_test',
)

print("\n RESULTATS MODEL")
print("Accuracy ", epsilon)
print("Interval of confidence:", accuracy_interval)

"""## Support Vector Machines"""

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

# Re-read the raw data so the SVM section starts from a clean state.
from google.colab import drive

# Mounting asks the user for permission to read their Google Drive.
drive.mount('/content/drive')
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

# Keep only students who graduated or dropped out. The explicit copy makes the
# filtered frame independent of `data`, so the column assignment below is not
# a write into a slice (avoids pandas' SettingWithCopyWarning).
data_svm = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Encode the categorical target as integers.
data_svm['Target'] = LabelEncoder().fit_transform(data_svm['Target'])
data_svm.shape  # notebook display only; has no effect when run as a script

X = data_svm.drop('Target', axis=1)
y = data_svm['Target']
print(X.shape)
print(y.shape)

# One-hot encode, split into train/test, then scale.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Convert categorical variables into dummy/indicator variables.
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

scaler = MinMaxScaler(feature_range=(-1, 1))

# The scaled training data must replace X_train. The original stored it in
# `X_train_`, which nothing reads, so every SVM below was trained on unscaled
# features and then evaluated on the scaled X_test.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Alias kept so the module-level name `X_train_` still exists.
X_train_ = X_train

"""Mirar pels diferents tipus de kernel que pot tenir el model de Support Vector Machines:


*  linear
*  rbf
*  poly

### SVM linear
"""

#Optimitzar SVM linear i els seus parametres
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt

# Grid search over C for the linear kernel (7 log-spaced values from 1e-3 to
# 1e5), scored with 10-fold cross-validation.
Cs = np.logspace(-3, 5, num=7, base=10.0)
param_grid_linear = dict(C=Cs, kernel=['linear'])
grid_search_linear = GridSearchCV(SVC(), param_grid_linear, cv=10)
grid_search_linear.fit(X_train, y_train)

# Best C found by the search.
best_C_linear = grid_search_linear.best_params_['C']

# Mean cross-validated score for each C, plotted on a logarithmic x axis.
scores_linear = grid_search_linear.cv_results_['mean_test_score']
plt.semilogx(Cs, scores_linear)
plt.title('SVM Linear- Accuracy en parametre C')
plt.xlabel('valor C')
plt.ylabel('Mitjana dels resultats')
plt.show()

# Fit a linear SVM with the best C and evaluate it on the test set.
svm_linear = SVC(kernel='linear', C=best_C_linear)
svm_linear.fit(X_train, y_train)
pred_linear = svm_linear.predict(X_test)

print("RESULTATS KERNEL SVM LINEAR:")

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ ")
cm = confusion_matrix(y_test, pred_linear)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

# Support vectors summed over classes, and their share of the training rows.
linear_support_total = np.sum(svm_linear.n_support_)
linear_support_share = linear_support_total/X_train.shape[0]

print("ACCURACY en el test set: ", accuracy_score(y_test, pred_linear))
print("Millor parametre C: ", best_C_linear)
print("Numero de suports: ", linear_support_total)
print("Proporció dels supports: ", linear_support_share)

"""### SVM poly"""

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Grid search over C and polynomial degree (gamma fixed to 'scale'), scored
# with 3-fold cross-validation and run in parallel.
Cs = np.logspace(-3, 5, num=4, base=10.0)
param_grid_poly = {
    'C': Cs,
    'kernel': ['poly'],
    'degree': [2, 3,4],
    'gamma': ['scale'],
}
grid_search_poly = GridSearchCV(SVC(), param_grid_poly, cv=3, n_jobs=-1, verbose=1)
grid_search_poly.fit(X_train, y_train)

# Arrange the mean CV scores as a (C, degree) table.
# NOTE(review): assumes the candidates are ordered with C varying slowest and
# degree fastest - confirm against cv_results_['params'].
scores_poly = grid_search_poly.cv_results_['mean_test_score']
poly_table_shape = (len(Cs), len(param_grid_poly['degree']))
scores_poly_reshaped = np.array(scores_poly).reshape(*poly_table_shape)

# Heatmap of cross-validation accuracy.
plt.figure(figsize=(8, 6))
sns.heatmap(
    scores_poly_reshaped,
    annot=True,
    xticklabels=param_grid_poly['degree'],
    yticklabels=Cs,
    cmap="plasma",
)
plt.xlabel('Degree Entrats: 2, 3 o 4')
plt.ylabel('valor C')
plt.title('SVM Polynominal - Grid Search Scores')
plt.show()

# Fit a polynomial SVM with the best parameters and evaluate on the test set.
best_params_poly = grid_search_poly.best_params_
svm_poly = SVC(
    kernel='poly',
    C=best_params_poly['C'],
    degree=best_params_poly['degree'],
    gamma=best_params_poly['gamma'],
)
svm_poly.fit(X_train, y_train)
pred_poly = svm_poly.predict(X_test)

print("RESULTATS KERNEL SVM POLYNOMIAL:")

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ:")
cm_poly = confusion_matrix(y_test, pred_poly)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_poly, annot=True, fmt="d", cmap="plasma")
plt.show()

# Support vectors summed over classes, and their share of the training rows.
poly_support_total = np.sum(svm_poly.n_support_)
poly_support_share = poly_support_total/X_train.shape[0]

print("ACCURACY on the test set: ", accuracy_score(y_test, pred_poly))
print("Millors parameters: ", best_params_poly)
print("Numero de suports: ", poly_support_total)
print("Proporció dels supports: ", poly_support_share)

"""### SVM rbf"""

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Grid search over C and gamma for the RBF kernel, scored with 5-fold
# cross-validation.
gammas = np.logspace(-4, 1, num=5, base=10.0)
Cs = np.logspace(-1, 6, num=7, base=10.0)

param_grid_rbf = dict(C=Cs, gamma=gammas, kernel=['rbf'])
grid_search_rbf = GridSearchCV(SVC(), param_grid_rbf, cv=5)
grid_search_rbf.fit(X_train, y_train)

# Arrange the mean CV scores as a (C, gamma) table for the heatmap.
# NOTE(review): assumes the candidates are ordered with C varying slowest and
# gamma fastest - confirm against cv_results_['params'].
rbf_table_shape = (len(Cs), len(gammas))
scores_rbf = grid_search_rbf.cv_results_['mean_test_score']
scores_rbf = np.array(scores_rbf).reshape(*rbf_table_shape)

plt.figure(figsize=(8, 6))
sns.heatmap(
    scores_rbf,
    annot=True,
    fmt=".3f",
    xticklabels=gammas,
    yticklabels=Cs,
    cmap="plasma",
)
plt.xlabel('valors de Gamma')
plt.ylabel('valor C')
plt.title('SVM RBF  - Grid Search Scores')
plt.show()

# Fit an RBF SVM with the best parameters and evaluate it on the test set.
best_params_rbf = grid_search_rbf.best_params_
svm_rbf = SVC(
    kernel='rbf',
    C=best_params_rbf['C'],
    gamma=best_params_rbf['gamma'],
)
svm_rbf.fit(X_train, y_train)
pred_rbf = svm_rbf.predict(X_test)

print("RESULTATS KERNEL SVM RBF:")

# Confusion matrix of the test-set predictions, drawn as a heatmap.
cm_rbf = confusion_matrix(y_test, pred_rbf)
print("\nMATRIU DE CONFUSIÓ:")
plt.figure(figsize=(6, 4))
sns.heatmap(cm_rbf, annot=True, fmt="d", cmap="plasma")
plt.show()

# Support vectors summed over classes, and their share of the training rows.
rbf_support_total = np.sum(svm_rbf.n_support_)
rbf_support_share = rbf_support_total/X_train.shape[0]

print("ACCURACY on test set: ", accuracy_score(y_test, pred_rbf))
print("Milllors parameters: ", best_params_rbf)
print("Numero de suports: ", rbf_support_total)
print("Proporció dels supports:  ", rbf_support_share)

"""### Informe del millor SVM"""

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Report for the polynomial SVM: the grid search over C and degree is run
# again here with the same grid as the polynomial section above.
Cs = np.logspace(-3, 5, num=4, base=10.0)
param_grid_poly = {
    'C': Cs,
    'kernel': ['poly'],
    'degree': [2, 3,4],
    'gamma': ['scale'],
}
grid_search_poly = GridSearchCV(SVC(), param_grid_poly, cv=3, n_jobs=-1, verbose=1)
grid_search_poly.fit(X_train, y_train)

# Mean CV scores, also arranged as a (C, degree) table.
scores_poly = grid_search_poly.cv_results_['mean_test_score']
poly_table_shape = (len(Cs), len(param_grid_poly['degree']))
scores_poly_reshaped = np.array(scores_poly).reshape(*poly_table_shape)

# Fit the final model with the best parameters found.
best_params_poly = grid_search_poly.best_params_
svm_poly = SVC(
    kernel='poly',
    C=best_params_poly['C'],
    degree=best_params_poly['degree'],
    gamma=best_params_poly['gamma'],
)
svm_poly.fit(X_train, y_train)
pred_poly = svm_poly.predict(X_test)

poly_report = classification_report(y_test, pred_poly)
print('\n INFORME MODEL SVM POLYNOMINAL  \n')
print(poly_report)

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ:")
cm_poly = confusion_matrix(y_test, pred_poly)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_poly, annot=True, fmt="d", cmap="plasma")
plt.show()

# Support vectors summed over classes, and their share of the training rows.
poly_support_total = np.sum(svm_poly.n_support_)
poly_support_share = poly_support_total/X_train.shape[0]

print("ACCURACY on the test set: ", accuracy_score(y_test, pred_poly))
print("Millors parameters: ", best_params_poly)
print("Numero de suports: ", poly_support_total)
print("Proporció dels supports: ", poly_support_share)


"""## Meta-learning algorithms"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import StandardScaler,LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets as ds
import sklearn.model_selection as cv
import sklearn.neighbors as nb
# %matplotlib inline

# Re-read the raw data so the meta-learning section starts from a clean state.
from google.colab import drive

# Mounting asks the user for permission to read their Google Drive.
drive.mount('/content/drive')
data = pd.read_csv('drive/My Drive/MIDA/data.csv', sep=';')

# Keep only students who graduated or dropped out. The explicit copy makes the
# filtered frame independent of `data`, so the column assignment below is not
# a write into a slice (avoids pandas' SettingWithCopyWarning).
data_mt = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Encode the categorical target as integers.
data_mt['Target'] = LabelEncoder().fit_transform(data_mt['Target'])
data_mt.shape  # notebook display only; has no effect when run as a script

X = data_mt.drop('Target', axis=1)
y = data_mt['Target']
print(X.shape)
print(y.shape)

# One-hot encode, split into train/test, then scale.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X=pd.get_dummies(X)

# Split BEFORE scaling: the original fitted the scaler on every row, so the
# test rows contributed to the mean/variance used to transform the training
# data. The row assignment is unchanged (same sizes and random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from training rows only
X_test = scaler.transform(X_test)

# Kept so the module-level name `X_scaled` still exists; it is the full
# feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

"""
####Classificadors individuals


*   Gaussian Naive Bayes
*   K-Nearest Neigbors
*   Decision Tree

"""

# Gaussian Naive Bayes
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Number of cross-validation folds used by every experiment in this section.
cv = 50
clf1 = GaussianNB()
scores = cross_val_score(clf1, X, y, cv=cv, scoring='accuracy')
print("Accuracy: %0.3f de Naive Bayes" % scores.mean())

# K-Nearest Neighbours: tune the number of neighbours and the weighting.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

params = {'n_neighbors': list(range(1, 30, 2)), 'weights': ('distance', 'uniform')}
knc = KNeighborsClassifier()
clf_knn = GridSearchCV(knc, param_grid=params, cv=cv, n_jobs=-1)
clf_knn.fit(X, y)
print("Millors parametres per KNN =", clf_knn.best_params_, "Accuracy =", clf_knn.best_score_)

# k-NN with the parameters found above. It must be bound to `clf2`: the
# original assigned it to `clf3`, which the decision tree below overwrote,
# leaving `clf2` undefined for the voting ensemble that references it.
clf2 = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'], weights=clf_knn.best_params_['weights'])
scores = cross_val_score(clf2, X, y, cv=cv, scoring='accuracy')
print("Accuracy: %0.3f  de KNN " % scores.mean())

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

clf3 = DecisionTreeClassifier(criterion='entropy')
scores = cross_val_score(clf3, X, y, cv=cv, scoring='accuracy')
print("Accuracy: %0.3f de Decision Tree" % scores.mean())

"""####Voting pels classificadors individuals"""

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# The same three base classifiers are combined in both ensembles.
base_estimators = [('nb', clf1), ('knn3', clf2), ('dt', clf3)]

# Hard voting: majority vote over the predicted labels.
eclf = VotingClassifier(estimators=list(base_estimators), voting='hard')
scores = cross_val_score(eclf, X, y, cv=cv, scoring='accuracy')
hard_voting_accuracy = scores.mean()
print("Accuracy: %0.3f [%s]" % (hard_voting_accuracy, "Majority Voting: Hard voting"))

# Soft voting over the predicted probabilities, with weights 2/1/2 for
# Naive Bayes / k-NN / decision tree.
eclf = VotingClassifier(estimators=list(base_estimators), voting='soft', weights=[2,1,2])
scores = cross_val_score(eclf, X, y, cv=cv, scoring='accuracy')
soft_voting_accuracy = scores.mean()
print("Accuracy: %0.3f [%s]" % (soft_voting_accuracy, "Weighted Voting: Soft voting"))

"""#### Bagging"""

# Bagging with decision trees: sweep the number of estimators and keep the
# count with the highest mean cross-validated accuracy.
from sklearn.ensemble import BaggingClassifier

print("RESULTATS DECISION TREE BAGGING - ESTIMADORS: ")
lb = []
best_accuracy = 0
best_bagg = 0

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    bagged_trees = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=nest)
    scores = cross_val_score(bagged_trees, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lb.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num estimadors: {nest}")
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_bagg = accuracy, nest

print("\n Millor num estimadors", best_bagg, "amb un Accuracy de:", best_accuracy)

# Bagging with decision trees where each estimator is limited to a fraction
# (max_f) of the features; same sweep over the number of estimators.
print("\n RESULTATS DECISION TREE BAGGING - ESTIMADORS i MAX FEATURES: ")
lb2 = []
best_accuracy = 0
best_bagg_maxf = 0
max_f = 0.35

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    bagged_trees = BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=nest,
        max_features=max_f,
    )
    scores = cross_val_score(bagged_trees, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lb2.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num estimadors: {nest}")
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_bagg_maxf = accuracy, nest

print("\n Millor num estimadors", best_bagg_maxf, "amb un Accuracy de:", best_accuracy, " amb maxim features:",max_f)

"""####Random Forest i Extra Trees

"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest: sweep the number of trees and keep the count with the
# highest mean cross-validated accuracy.
print("RESULTATS RANDOM FOREST:")
lext = []
best_accuracy = 0
best_rand_tree = 0

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    forest = RandomForestClassifier(n_estimators=nest)
    scores = cross_val_score(forest, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lext.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num arbres: {nest}")
    # Strict '>' keeps the smallest forest among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_rand_tree = accuracy, nest

print("\n Millor num d'arbre", best_rand_tree, "amb un Accuracy de:", best_accuracy)

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

# Extra trees: sweep the number of trees and keep the count with the highest
# mean cross-validated accuracy.
print("RESULTATS EXTRA TREES: ")
lext = []
best_accuracy = 0
best_extra_tree = 0

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    extra_trees = ExtraTreesClassifier(n_estimators=nest)
    scores = cross_val_score(extra_trees, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lext.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num arbres: {nest}")
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_extra_tree = accuracy, nest

print("\n Millor num d'arbre", best_extra_tree, "amb un Accuracy de:", best_accuracy)

"""

####AdaBoost"""

# AdaBoost with its default base estimator: sweep the number of estimators
# and keep the count with the highest mean cross-validated accuracy.
from sklearn.ensemble import AdaBoostClassifier

print("RESULTATS ADABOOST:")
lboo = []
best_accuracy = 0
best_ada = 0

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    booster = AdaBoostClassifier(n_estimators=nest)
    scores = cross_val_score(booster, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lboo.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num estimadors: {nest}")
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_ada = accuracy, nest

print("\n Millor num d'estimadors", best_ada, "amb un Accuracy de:", best_accuracy)

# AdaBoost over decision trees limited to depth max_d; same sweep over the
# number of estimators.
from sklearn.ensemble import AdaBoostClassifier

print("RESULTATS ADABOOST - MAX DEPTH:")
lboodt = []
best_accuracy = 0
best_ada_maxd = 0
max_d = 5

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_d), n_estimators=nest)
    scores = cross_val_score(booster, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lboodt.append(accuracy)
    print(f" Accuracy: {accuracy:<20} - num estimadors: {nest}")
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_ada_maxd = accuracy, nest

print("\n Millor num d'estimadors", best_ada_maxd, "amb un Accuracy de:", best_accuracy, "amb màxima profunditat: ",max_d)

"""### Informe del millor algorisme"""

# Results for the meta-learning algorithm chosen as best (Extra Trees): the
# sweep over the number of trees is repeated, without per-size output, to
# obtain `best_extra_tree` for the final model.

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

print("RESULTATS EXTRA TREES: ")
lext = []
best_accuracy = 0
best_extra_tree = 0

for nest in (1, 2, 5, 10, 20, 50, 100, 200):
    extra_trees = ExtraTreesClassifier(n_estimators=nest)
    scores = cross_val_score(extra_trees, X, y, cv=cv, scoring='accuracy')
    accuracy = scores.mean()
    lext.append(accuracy)
    # Strict '>' keeps the smallest ensemble among equal scores.
    if accuracy > best_accuracy:
        best_accuracy, best_extra_tree = accuracy, nest

print("\n Millor num d'arbre", best_extra_tree, "amb un Accuracy de:", best_accuracy)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Final Extra Trees model with the selected number of trees, fitted on the
# training split and evaluated on the test split.
model = ExtraTreesClassifier(n_estimators=best_extra_tree)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Classification report.
extra_trees_report = classification_report(y_test, pred)
print('\n INFORME MODEL EXTRA TREES \n')
print(extra_trees_report)

# Confusion matrix of the test-set predictions, drawn as a heatmap.
print("\nMATRIU DE CONFUSIÓ ")
cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="plasma")
plt.show()

# Overall test accuracy.
extra_trees_accuracy = accuracy_score(y_test, pred)
print("\n RESULTATS MODEL")
print("Accuracy: ", extra_trees_accuracy)