{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "410f9559",
   "metadata": {},
   "source": [
    "## Procesos ETL o Data Wrangling\n",
    "### 1. Carga de Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "891b31e0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Tamaño:  (768, 9)\n",
      "\n",
      "Columnas= ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Diabetes']\n",
      "\n",
      "Tipos de variable:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      " #   Column                    Non-Null Count  Dtype  \n",
      "---  ------                    --------------  -----  \n",
      " 0   Pregnancies               768 non-null    int64  \n",
      " 1   Glucose                   768 non-null    int64  \n",
      " 2   BloodPressure             768 non-null    int64  \n",
      " 3   SkinThickness             768 non-null    int64  \n",
      " 4   Insulin                   768 non-null    int64  \n",
      " 5   BMI                       768 non-null    float64\n",
      " 6   DiabetesPedigreeFunction  768 non-null    float64\n",
      " 7   Age                       768 non-null    int64  \n",
      " 8   Diabetes                  768 non-null    int64  \n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n",
      "None\n",
      "\n",
      "Estadísticos:\n",
      "       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \\\n",
      "count   768.000000  768.000000     768.000000     768.000000  768.000000   \n",
      "mean      3.845052  120.894531      69.105469      20.536458   79.799479   \n",
      "std       3.369578   31.972618      19.355807      15.952218  115.244002   \n",
      "min       0.000000    0.000000       0.000000       0.000000    0.000000   \n",
      "25%       1.000000   99.000000      62.000000       0.000000    0.000000   \n",
      "50%       3.000000  117.000000      72.000000      23.000000   30.500000   \n",
      "75%       6.000000  140.250000      80.000000      32.000000  127.250000   \n",
      "max      17.000000  199.000000     122.000000      99.000000  846.000000   \n",
      "\n",
      "              BMI  DiabetesPedigreeFunction         Age    Diabetes  \n",
      "count  768.000000                768.000000  768.000000  768.000000  \n",
      "mean    31.992578                428.235091   33.240885    0.348958  \n",
      "std      7.884160                340.485655   11.760232    0.476951  \n",
      "min      0.000000                  0.100000   21.000000    0.000000  \n",
      "25%     27.300000                205.000000   24.000000    0.000000  \n",
      "50%     32.000000                337.000000   29.000000    0.000000  \n",
      "75%     36.600000                591.500000   41.000000    1.000000  \n",
      "max     67.100000               2329.000000   81.000000    1.000000  \n",
      "   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \\\n",
      "0            6      148             72             35        0  33.6   \n",
      "1            1       85             66             29        0  26.6   \n",
      "2            8      183             64              0        0  23.3   \n",
      "3            1       89             66             23       94  28.1   \n",
      "4            0      137             40             35      168  43.1   \n",
      "\n",
      "   DiabetesPedigreeFunction  Age  Diabetes  \n",
      "0                     627.0   50         1  \n",
      "1                     351.0   31         0  \n",
      "2                     672.0   32         1  \n",
      "3                     167.0   21         0  \n",
      "4                    2288.0   33         1  \n"
     ]
    }
   ],
   "source": [
    "# Diabetes\n",
    "#\n",
    "import pandas as pd\n",
    "import os\n",
    "separador=os.sep\n",
    "\n",
    "# Path del archivo\n",
    "carpeta=str(os.path.join(os.getcwd(),\"datasets\"))\n",
    "archivo=str(os.path.join(carpeta,\"diabetes.csv\"))\n",
    "\n",
    "# Convertir archivo a DataFrame de Pandas\n",
    "df_diabetes = pd.read_csv(archivo)\n",
    "\n",
    "# Tamaño del datast\n",
    "print(\"\\nTamaño: \",df_diabetes.shape)\n",
    "\n",
    "# Columnas\n",
    "columnas_diabetes=df_diabetes.columns.tolist()\n",
    "print(\"\\nColumnas=\",columnas_diabetes)\n",
    "\n",
    "# Tipos de variables\n",
    "print(\"\\nTipos de variable:\")\n",
    "print(df_diabetes.info())\n",
    "\n",
    "# Estadisticos:\n",
    "print(\"\\nEstadísticos:\")\n",
    "print(df_diabetes.describe())\n",
    "print(df_diabetes.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "bc1641a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Tamaño:  (891, 12)\n",
      "\n",
      "Columnas= ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']\n",
      "\n",
      "Tipos de variable:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    object \n",
      " 5   Age          714 non-null    float64\n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     889 non-null    object \n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n",
      "None\n",
      "\n",
      "Estadísticos:\n",
      "       PassengerId    Survived      Pclass         Age       SibSp  \\\n",
      "count   891.000000  891.000000  891.000000  714.000000  891.000000   \n",
      "mean    446.000000    0.383838    2.308642   29.699118    0.523008   \n",
      "std     257.353842    0.486592    0.836071   14.526497    1.102743   \n",
      "min       1.000000    0.000000    1.000000    0.420000    0.000000   \n",
      "25%     223.500000    0.000000    2.000000   20.125000    0.000000   \n",
      "50%     446.000000    0.000000    3.000000   28.000000    0.000000   \n",
      "75%     668.500000    1.000000    3.000000   38.000000    1.000000   \n",
      "max     891.000000    1.000000    3.000000   80.000000    8.000000   \n",
      "\n",
      "            Parch        Fare  \n",
      "count  891.000000  891.000000  \n",
      "mean     0.381594   32.204208  \n",
      "std      0.806057   49.693429  \n",
      "min      0.000000    0.000000  \n",
      "25%      0.000000    7.910400  \n",
      "50%      0.000000   14.454200  \n",
      "75%      0.000000   31.000000  \n",
      "max      6.000000  512.329200  \n"
     ]
    }
   ],
   "source": [
    "# Titanic\n",
    "#\n",
    "# URL del dataset en GitHub\n",
    "url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'\n",
    "\n",
    "# Cargar el dataset desde la URL\n",
    "df_titanic = pd.read_csv(url)\n",
    "\n",
    "# Tamaño del datast\n",
    "print(\"\\nTamaño: \",df_titanic.shape)\n",
    "\n",
    "# Columnas\n",
    "columnas_titanic=df_titanic.columns.tolist()\n",
    "print(\"\\nColumnas=\",columnas_titanic)\n",
    "\n",
    "# Tipos de variables\n",
    "print(\"\\nTipos de variable:\")\n",
    "print(df_titanic.info())\n",
    "\n",
    "# Estadisticos:\n",
    "print(\"\\nEstadísticos:\")\n",
    "print(df_titanic.describe())\n",
    "\n",
    "# Guardar CSV\n",
    "nombre_archivo=\"datasets\"+str(separador)+\"titanic.csv\"\n",
    "df_titanic.to_csv(nombre_archivo)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ac831e7",
   "metadata": {},
   "source": [
    "### 2. Inconsistencias y anomalias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "fea3f609",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Valores duplicados: 0\n",
      "\n",
      "Valores faltantes por columna:\n",
      "Pregnancies                 0\n",
      "Glucose                     0\n",
      "BloodPressure               0\n",
      "SkinThickness               0\n",
      "Insulin                     0\n",
      "BMI                         0\n",
      "DiabetesPedigreeFunction    0\n",
      "Age                         0\n",
      "Diabetes                    0\n",
      "dtype: int64\n",
      "\n",
      "Cantidad de Ceros:\n",
      "Glucose: 5\n",
      "BloodPressure: 35\n",
      "SkinThickness: 227\n",
      "Insulin: 374\n",
      "BMI: 11\n",
      "DiabetesPedigreeFunction: 0\n",
      "Age: 0\n",
      "Diabetes: 500\n"
     ]
    }
   ],
   "source": [
    "# Dataset de Diabetes\n",
    "\n",
    "# Identificar valores duplicados \n",
    "duplicados = df_diabetes.duplicated().sum() \n",
    "print(f\"\\nValores duplicados: {duplicados}\") \n",
    "\n",
    "# Identificar valores faltantes \n",
    "valores_faltantes = df_diabetes.isnull().sum() \n",
    "print(\"\\nValores faltantes por columna:\") \n",
    "print(valores_faltantes) \n",
    "\n",
    "# Mostrar Variables con ceros \n",
    "print(\"\\nCantidad de Ceros:\")\n",
    "for columna in columnas_diabetes[1:]:\n",
    "    num_zeros = (df_diabetes[columna] == 0).sum()\n",
    "    print(f\"{columna}: {num_zeros}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e9f7a90b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Valores duplicados: 0\n",
      "\n",
      "Valores faltantes por columna:\n",
      "PassengerId      0\n",
      "Survived         0\n",
      "Pclass           0\n",
      "Name             0\n",
      "Sex              0\n",
      "Age            177\n",
      "SibSp            0\n",
      "Parch            0\n",
      "Ticket           0\n",
      "Fare             0\n",
      "Cabin          687\n",
      "Embarked         2\n",
      "dtype: int64\n",
      "\n",
      "Cantidad de Ceros:\n",
      "PassengerId: 0\n",
      "Survived: 549\n",
      "Pclass: 0\n",
      "Name: 0\n",
      "Sex: 0\n",
      "Age: 0\n",
      "SibSp: 608\n",
      "Parch: 678\n",
      "Ticket: 0\n",
      "Fare: 15\n",
      "Cabin: 0\n",
      "Embarked: 0\n"
     ]
    }
   ],
   "source": [
    "# Dataset de Titanic\n",
    "\n",
    "# Identificar valores duplicados \n",
    "duplicados = df_titanic.duplicated().sum() \n",
    "print(f\"\\nValores duplicados: {duplicados}\") \n",
    "\n",
    "# Identificar valores faltantes \n",
    "valores_faltantes = df_titanic.isnull().sum() \n",
    "print(\"\\nValores faltantes por columna:\") \n",
    "print(valores_faltantes) \n",
    "\n",
    "# Mostrar Variables con ceros \n",
    "print(\"\\nCantidad de Ceros:\")\n",
    "for columna in columnas_titanic[:]:\n",
    "    num_zeros = (df_titanic[columna] == 0).sum()\n",
    "    print(f\"{columna}: {num_zeros}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf49145e",
   "metadata": {},
   "source": [
    "## 3. Manejo de Anomalías y datos faltantes\n",
    "### Diabetes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "f56d8b8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Promedios de columnas excluyendo ceros (reemplazarán a los datos faltantes o nulos)\n",
      "                   Variable    Promedio\n",
      "0                   Glucose  121.686763\n",
      "1             BloodPressure   72.405184\n",
      "2             SkinThickness   29.153420\n",
      "3                   Insulin  155.548223\n",
      "4                       BMI   32.457464\n",
      "5  DiabetesPedigreeFunction  428.235091\n",
      "6                       Age   33.240885\n",
      "\n",
      "Reemplazo de los datos faltantes o nulos)\n",
      "   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \\\n",
      "0            6    148.0           72.0       35.00000  155.548223  33.6   \n",
      "1            1     85.0           66.0       29.00000  155.548223  26.6   \n",
      "2            8    183.0           64.0       29.15342  155.548223  23.3   \n",
      "3            1     89.0           66.0       23.00000   94.000000  28.1   \n",
      "4            0    137.0           40.0       35.00000  168.000000  43.1   \n",
      "\n",
      "   DiabetesPedigreeFunction  Age  Diabetes  \n",
      "0                     627.0   50         1  \n",
      "1                     351.0   31         0  \n",
      "2                     672.0   32         1  \n",
      "3                     167.0   21         0  \n",
      "4                    2288.0   33         1  \n"
     ]
    }
   ],
   "source": [
    "medias=[]\n",
    "variables=[]\n",
    "print(\"Promedios de columnas excluyendo ceros (reemplazarán a los datos faltantes o nulos)\")\n",
    "for i in range (1,len(columnas_diabetes)-1):\n",
    "    variable=columnas_diabetes[i]\n",
    "    mean_excluding_zeros = df_diabetes[df_diabetes[variable] != 0][variable].mean()\n",
    "    variables.append(variable)\n",
    "    medias.append(mean_excluding_zeros)\n",
    "dfMedias=pd.DataFrame()\n",
    "dfMedias[\"Variable\"]=variables\n",
    "dfMedias[\"Promedio\"]=medias\n",
    "print(dfMedias)\n",
    "\n",
    "print(\"\\nReemplazo de los datos faltantes o nulos)\")\n",
    "# Reemplazar los ceros por el valor medio calculado\n",
    "\n",
    "columns_to_impute = df_diabetes.columns.difference(['Pregnancies', 'Diabetes'])\n",
    "# Calcular los promedios excluyendo los ceros para cada columna relevante\n",
    "mean_values = {}\n",
    "for column in columns_to_impute:\n",
    "    mean_values[column] = df_diabetes[df_diabetes[column] != 0][column].mean()\n",
    "\n",
    "# Reemplazar los ceros por los promedios calculados\n",
    "for column in columns_to_impute:\n",
    "    df_diabetes[column] = df_diabetes[column].replace(0, mean_values[column])\n",
    "\n",
    "# Mostrar los resultados\n",
    "print(df_diabetes.head())            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "73af76e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pregnancies</th>\n",
       "      <th>Glucose</th>\n",
       "      <th>BloodPressure</th>\n",
       "      <th>SkinThickness</th>\n",
       "      <th>Insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>DiabetesPedigreeFunction</th>\n",
       "      <th>Age</th>\n",
       "      <th>Diabetes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35.00000</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>33.6</td>\n",
       "      <td>627.00</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29.00000</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>26.6</td>\n",
       "      <td>351.00</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>29.15342</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>23.3</td>\n",
       "      <td>672.00</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23.00000</td>\n",
       "      <td>94.000000</td>\n",
       "      <td>28.1</td>\n",
       "      <td>167.00</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35.00000</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2288.00</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>763</th>\n",
       "      <td>10</td>\n",
       "      <td>101.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>48.00000</td>\n",
       "      <td>180.000000</td>\n",
       "      <td>32.9</td>\n",
       "      <td>171.00</td>\n",
       "      <td>63</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>764</th>\n",
       "      <td>2</td>\n",
       "      <td>122.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>27.00000</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>36.8</td>\n",
       "      <td>0.34</td>\n",
       "      <td>27</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>765</th>\n",
       "      <td>5</td>\n",
       "      <td>121.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>23.00000</td>\n",
       "      <td>112.000000</td>\n",
       "      <td>26.2</td>\n",
       "      <td>245.00</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>766</th>\n",
       "      <td>1</td>\n",
       "      <td>126.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>29.15342</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>30.1</td>\n",
       "      <td>349.00</td>\n",
       "      <td>47</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>1</td>\n",
       "      <td>93.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>31.00000</td>\n",
       "      <td>155.548223</td>\n",
       "      <td>30.4</td>\n",
       "      <td>315.00</td>\n",
       "      <td>23</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>768 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \\\n",
       "0              6    148.0           72.0       35.00000  155.548223  33.6   \n",
       "1              1     85.0           66.0       29.00000  155.548223  26.6   \n",
       "2              8    183.0           64.0       29.15342  155.548223  23.3   \n",
       "3              1     89.0           66.0       23.00000   94.000000  28.1   \n",
       "4              0    137.0           40.0       35.00000  168.000000  43.1   \n",
       "..           ...      ...            ...            ...         ...   ...   \n",
       "763           10    101.0           76.0       48.00000  180.000000  32.9   \n",
       "764            2    122.0           70.0       27.00000  155.548223  36.8   \n",
       "765            5    121.0           72.0       23.00000  112.000000  26.2   \n",
       "766            1    126.0           60.0       29.15342  155.548223  30.1   \n",
       "767            1     93.0           70.0       31.00000  155.548223  30.4   \n",
       "\n",
       "     DiabetesPedigreeFunction  Age  Diabetes  \n",
       "0                      627.00   50         1  \n",
       "1                      351.00   31         0  \n",
       "2                      672.00   32         1  \n",
       "3                      167.00   21         0  \n",
       "4                     2288.00   33         1  \n",
       "..                        ...  ...       ...  \n",
       "763                    171.00   63         0  \n",
       "764                      0.34   27         0  \n",
       "765                    245.00   30         0  \n",
       "766                    349.00   47         1  \n",
       "767                    315.00   23         0  \n",
       "\n",
       "[768 rows x 9 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diabetes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45ee240a",
   "metadata": {},
   "source": [
    "### Titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "b0d89593",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imputar datos faltantes \n",
    "# A faltantes de edad se imputa la mediana\n",
    "df_titanic['Age'].fillna(df_titanic['Age'].median(), inplace=True)\n",
    "\n",
    "# A faltantes de cabina, se le imputa “NC”\n",
    "df_titanic['Cabin'].fillna('NC', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "1e076b52",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NC</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NC</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NC</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>887</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>NC</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>888</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>889</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>28.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>NC</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>890</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>891</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>NC</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "0              1         0       3   \n",
       "1              2         1       1   \n",
       "2              3         1       3   \n",
       "3              4         1       1   \n",
       "4              5         0       3   \n",
       "..           ...       ...     ...   \n",
       "886          887         0       2   \n",
       "887          888         1       1   \n",
       "888          889         0       3   \n",
       "889          890         1       1   \n",
       "890          891         0       3   \n",
       "\n",
       "                                                  Name     Sex   Age  SibSp  \\\n",
       "0                              Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                               Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                             Allen, Mr. William Henry    male  35.0      0   \n",
       "..                                                 ...     ...   ...    ...   \n",
       "886                              Montvila, Rev. Juozas    male  27.0      0   \n",
       "887                       Graham, Miss. Margaret Edith  female  19.0      0   \n",
       "888           Johnston, Miss. Catherine Helen \"Carrie\"  female  28.0      1   \n",
       "889                              Behr, Mr. Karl Howell    male  26.0      0   \n",
       "890                                Dooley, Mr. Patrick    male  32.0      0   \n",
       "\n",
       "     Parch            Ticket     Fare Cabin Embarked  \n",
       "0        0         A/5 21171   7.2500    NC        S  \n",
       "1        0          PC 17599  71.2833   C85        C  \n",
       "2        0  STON/O2. 3101282   7.9250    NC        S  \n",
       "3        0            113803  53.1000  C123        S  \n",
       "4        0            373450   8.0500    NC        S  \n",
       "..     ...               ...      ...   ...      ...  \n",
       "886      0            211536  13.0000    NC        S  \n",
       "887      0            112053  30.0000   B42        S  \n",
       "888      2        W./C. 6607  23.4500    NC        S  \n",
       "889      0            111369  30.0000  C148        C  \n",
       "890      0            370376   7.7500    NC        Q  \n",
       "\n",
       "[891 rows x 12 columns]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8786b126",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Indexación\n",
    "### Uso de iloc en Titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "6e8aa3f0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Columnas 1 a 4:\n",
      "    Survived  Pclass                                               Name\n",
      "0         0       3                            Braund, Mr. Owen Harris\n",
      "1         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...\n",
      "2         1       3                             Heikkinen, Miss. Laina\n",
      "3         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)\n",
      "4         0       3                           Allen, Mr. William Henry\n",
      "\n",
      "Valor promedio de la columna 5 ( Age ): 29.36158249158249\n"
     ]
    }
   ],
   "source": [
    "# Seleccionar las columnas de la segunda a la cuarta  \n",
    "sub_df=df_titanic.iloc[0:5, 1:4]\n",
    "print(\"\\nColumnas 1 a 4:\\n\", sub_df)\n",
    "\n",
    "# Valor promedio de la columna 5\n",
    "valor=df_titanic.iloc[:,5].mean()\n",
    "print(\"\\nValor promedio de la columna 5 (\", str(df_titanic.columns.to_list()[5]), \"):\", str(valor))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bc7ae6c",
   "metadata": {},
   "source": [
    "### Ejemplo de loc en Diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "7c538a9c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Age  Glucose  Diabetes\n",
      "0   50    148.0         1\n",
      "1   31     85.0         0\n",
      "2   32    183.0         1\n",
      "3   21     89.0         0\n",
      "4   33    137.0         1\n",
      "\n",
      "     Age  Glucose  Diabetes\n",
      "8     53    197.0         1\n",
      "22    41    196.0         1\n",
      "206   57    196.0         1\n",
      "228   31    197.0         0\n",
      "359   29    196.0         1\n",
      "408   39    197.0         1\n",
      "561   28    198.0         1\n",
      "579   62    197.0         1\n",
      "661   22    199.0         1\n"
     ]
    }
   ],
   "source": [
    "# Seleccionar las primeras 5 filas y las columnas 'Age', 'Glucose' y 'Diabetes'\n",
    "selected_data = df_diabetes.loc[:4, ['Age', 'Glucose', 'Diabetes']]\n",
    "print(selected_data)\n",
    "\n",
    "print(\"\")\n",
    "# Seleccionar todas las donde glucose > 195 y las columnas 'Age', 'Glucose' y 'Diabetes'\n",
    "filtrado_Glucose = df_diabetes.loc[df_diabetes['Glucose'] > 195, ['Age', 'Glucose', 'Diabetes']]\n",
    "print(filtrado_Glucose)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1615a5d3",
   "metadata": {},
   "source": [
    "## Agrupacion y Agregacion\n",
    "### Agrupación sobre Diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "4d5549c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          Age  Glucose\n",
      "Diabetes              \n",
      "0          21    197.0\n",
      "1          21    199.0\n"
     ]
    }
   ],
   "source": [
    "# Agrupar por Diagnóstico de diabetes y calcular la menor edad y mayor glucosa\n",
    "agrupado_diabetes = df_diabetes.groupby('Diabetes').agg({'Age': 'min', 'Glucose': 'max'})\n",
    "print(agrupado_diabetes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42ff7400",
   "metadata": {},
   "source": [
    "## Uniones y Fusiones\n",
    "### Union (join) sobre Diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "8db12f53",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Datos médicos:\n",
      "      Glucose  BloodPressure  SkinThickness     Insulin   BMI  Age\n",
      "0      148.0           72.0       35.00000  155.548223  33.6   50\n",
      "1       85.0           66.0       29.00000  155.548223  26.6   31\n",
      "2      183.0           64.0       29.15342  155.548223  23.3   32\n",
      "3       89.0           66.0       23.00000   94.000000  28.1   21\n",
      "4      137.0           40.0       35.00000  168.000000  43.1   33\n",
      "..       ...            ...            ...         ...   ...  ...\n",
      "763    101.0           76.0       48.00000  180.000000  32.9   63\n",
      "764    122.0           70.0       27.00000  155.548223  36.8   27\n",
      "765    121.0           72.0       23.00000  112.000000  26.2   30\n",
      "766    126.0           60.0       29.15342  155.548223  30.1   47\n",
      "767     93.0           70.0       31.00000  155.548223  30.4   23\n",
      "\n",
      "[768 rows x 6 columns]\n",
      "\n",
      "Datos demográficos:\n",
      "      Age  Diabetes\n",
      "0     50         1\n",
      "1     31         0\n",
      "2     32         1\n",
      "3     21         0\n",
      "4     33         1\n",
      "..   ...       ...\n",
      "763   63         0\n",
      "764   27         0\n",
      "765   30         0\n",
      "766   47         1\n",
      "767   23         0\n",
      "\n",
      "[768 rows x 2 columns]\n",
      "\n",
      "\n",
      "Union de Dataframes\n",
      "     Glucose  BloodPressure  SkinThickness     Insulin   BMI  Age  Diabetes\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         1\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         0\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         0\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         0\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         1\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         1\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         1\n",
      "0     148.0           72.0       35.00000  155.548223  33.6   50         1\n",
      "21     99.0           84.0       29.15342  155.548223  35.4   50         1\n",
      "21     99.0           84.0       29.15342  155.548223  35.4   50         0\n"
     ]
    }
   ],
   "source": [
    "# Crear DataFrames adicionales para simular la unión \n",
    "df_medidas = df_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',  'Age']] \n",
    "print(\"Datos médicos:\\n\",df_medidas)\n",
    "df_demografico = df_diabetes[['Age', 'Diabetes']] \n",
    "print(\"\\nDatos demográficos:\\n\",df_demografico)\n",
    "print(\"\")\n",
    "# Realizar la unión \n",
    "unido_diabetes = df_medidas.join(df_demografico.set_index('Age'), on='Age', how='inner', lsuffix='_med', rsuffix='_demo') \n",
    "print(\"\\nUnion de Dataframes\\n\",unido_diabetes.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12e76f40",
   "metadata": {},
   "source": [
    "### Fusion (Merge) sobre Titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "10e476fa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Pclass     Fare  Average_Fare\n",
      "0       3   7.2500     13.675550\n",
      "1       1  71.2833     84.154687\n",
      "2       3   7.9250     13.675550\n",
      "3       1  53.1000     84.154687\n",
      "4       3   8.0500     13.675550\n",
      "5       3   8.4583     13.675550\n",
      "6       1  51.8625     84.154687\n",
      "7       3  21.0750     13.675550\n",
      "8       3  11.1333     13.675550\n",
      "9       2  30.0708     20.662183\n"
     ]
    }
   ],
   "source": [
    "# Crear un DataFrame adicional para simular la fusión\n",
    "data_adicional = {\n",
    "    'Pclass': [1, 2, 3],\n",
    "    'Average_Fare': [84.154687, 20.662183, 13.675550]\n",
    "}\n",
    "df_adicional = pd.DataFrame(data_adicional)\n",
    "\n",
    "# Realizar la fusión\n",
    "fusionado_titanic = pd.merge(df_titanic, df_adicional, on='Pclass', how='left')\n",
    "print(fusionado_titanic[['Pclass', 'Fare', 'Average_Fare']].head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4474f90",
   "metadata": {},
   "source": [
    "## Discretizacion"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9471f002",
   "metadata": {},
   "source": [
    "### Titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "7c889e36",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Valores posibles por variable categorica:\n",
      "   Variable  valores posibles\n",
      "0      Name               891\n",
      "1       Sex                 2\n",
      "2     Cabin               148\n",
      "3    Ticket               681\n",
      "4  Embarked                 4\n"
     ]
    }
   ],
   "source": [
    "# Tratamiento de las variables categoricas:\n",
    "variables_categoricas=['Name','Sex','Cabin','Ticket','Embarked']\n",
    "\n",
    "# Cantidad de valores posibles (no numéricos) por variable\n",
    "df_cat=pd.DataFrame()\n",
    "val_posibles=[]\n",
    "for i in range(len(variables_categoricas)):\n",
    "        variable=variables_categoricas[i]\n",
    "        valor=len(list(set(list(df_titanic[variable]))))\n",
    "        val_posibles.append(valor)\n",
    "df_cat[\"Variable\"]=variables_categoricas\n",
    "df_cat[\"valores posibles\"]=val_posibles\n",
    "print(\"Valores posibles por variable categorica:\")\n",
    "print(df_cat)        "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da1ce",
   "metadata": {},
   "source": [
    "Con esta información definimos: eliminación de columnas Name, Cabin y Ticket, y haremos la discretización de Sex y Embarked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "878f6662",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Pclass_1</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>1</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>1</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived   Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  Pclass_3  \\\n",
       "0           0  22.0      1      0   7.2500         0         0         1   \n",
       "1           1  38.0      1      0  71.2833         1         0         0   \n",
       "2           1  26.0      0      0   7.9250         0         0         1   \n",
       "3           1  35.0      1      0  53.1000         1         0         0   \n",
       "4           0  35.0      0      0   8.0500         0         0         1   \n",
       "..        ...   ...    ...    ...      ...       ...       ...       ...   \n",
       "886         0  27.0      0      0  13.0000         0         1         0   \n",
       "887         1  19.0      0      0  30.0000         1         0         0   \n",
       "888         0  28.0      1      2  23.4500         0         0         1   \n",
       "889         1  26.0      0      0  30.0000         1         0         0   \n",
       "890         0  32.0      0      0   7.7500         0         0         1   \n",
       "\n",
       "     Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  \n",
       "0             0         1           0           0           1  \n",
       "1             1         0           1           0           0  \n",
       "2             1         0           0           0           1  \n",
       "3             1         0           0           0           1  \n",
       "4             0         1           0           0           1  \n",
       "..          ...       ...         ...         ...         ...  \n",
       "886           0         1           0           0           1  \n",
       "887           1         0           0           0           1  \n",
       "888           1         0           0           0           1  \n",
       "889           0         1           1           0           0  \n",
       "890           0         1           0           1           0  \n",
       "\n",
       "[891 rows x 13 columns]"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Eliminacion de columnas\n",
    "df_titanic_cleaned = df_titanic.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'])\n",
    "\n",
    "# Discretizacion de variables:\n",
    "df_titanic_encoded=pd.get_dummies(df_titanic_cleaned,columns=['Pclass','Sex','Embarked'])\n",
    "df_titanic_encoded"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9de15e2",
   "metadata": {},
   "source": [
    "## Normalización\n",
    "### Diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "b16bcf3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Primeros registros del dataset normalizado:\n",
      "   Pregnancies   Glucose  BloodPressure  SkinThickness       Insulin  \\\n",
      "0     0.639947  0.865108      -0.033518   6.655021e-01 -3.345079e-16   \n",
      "1    -0.844885 -1.206162      -0.529859  -1.746338e-02 -3.345079e-16   \n",
      "2     1.233880  2.015813      -0.695306   8.087936e-16 -3.345079e-16   \n",
      "3    -0.844885 -1.074652      -0.529859  -7.004289e-01 -7.243887e-01   \n",
      "4    -1.141852  0.503458      -2.680669   6.655021e-01  1.465506e-01   \n",
      "\n",
      "        BMI  DiabetesPedigreeFunction       Age  Diabetes  \n",
      "0  0.166292                  0.584149  1.425995         1  \n",
      "1 -0.852531                 -0.226986 -0.190672         0  \n",
      "2 -1.332833                  0.716400 -0.105584         1  \n",
      "3 -0.634212                 -0.767743 -1.041549         0  \n",
      "4  1.548980                  5.465654 -0.020496         1  \n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Crear el escalador estándar\n",
    "scaler = StandardScaler()\n",
    "\n",
    "target = df_diabetes['Diabetes']\n",
    "\n",
    "# Ajustar el escalador y transformar los datos\n",
    "data_normalized = scaler.fit_transform(df_diabetes)\n",
    "\n",
    "# Convertir el resultado a un DataFrame y agregar de nuevo la columna 'Outcome'\n",
    "data_normalized = pd.DataFrame(data_normalized, columns=df_diabetes.columns)\n",
    "data_normalized['Diabetes'] = target\n",
    "\n",
    "# Mostrar los primeros registros del dataset normalizado\n",
    "print(\"Primeros registros del dataset normalizado:\")\n",
    "print(data_normalized.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d89bacb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}