{ "cells": [ { "cell_type": "markdown", "id": "410f9559", "metadata": {}, "source": [ "## Procesos ETL o Data Wrangling\n", "### 1. Carga de Datasets" ] }, { "cell_type": "code", "execution_count": 49, "id": "891b31e0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Tamaño: (768, 9)\n", "\n", "Columnas= ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Diabetes']\n", "\n", "Tipos de variable:\n", "\n", "RangeIndex: 768 entries, 0 to 767\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Pregnancies 768 non-null int64 \n", " 1 Glucose 768 non-null int64 \n", " 2 BloodPressure 768 non-null int64 \n", " 3 SkinThickness 768 non-null int64 \n", " 4 Insulin 768 non-null int64 \n", " 5 BMI 768 non-null float64\n", " 6 DiabetesPedigreeFunction 768 non-null float64\n", " 7 Age 768 non-null int64 \n", " 8 Diabetes 768 non-null int64 \n", "dtypes: float64(2), int64(7)\n", "memory usage: 54.1 KB\n", "None\n", "\n", "Estadísticos:\n", " Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n", "count 768.000000 768.000000 768.000000 768.000000 768.000000 \n", "mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n", "std 3.369578 31.972618 19.355807 15.952218 115.244002 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n", "50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n", "75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n", "max 17.000000 199.000000 122.000000 99.000000 846.000000 \n", "\n", " BMI DiabetesPedigreeFunction Age Diabetes \n", "count 768.000000 768.000000 768.000000 768.000000 \n", "mean 31.992578 428.235091 33.240885 0.348958 \n", "std 7.884160 340.485655 11.760232 0.476951 \n", "min 0.000000 0.100000 21.000000 0.000000 \n", "25% 27.300000 205.000000 24.000000 0.000000 \n", "50% 32.000000 337.000000 29.000000 0.000000 \n", "75% 36.600000 591.500000 41.000000 1.000000 \n", "max 67.100000 2329.000000 81.000000 1.000000 \n", " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", "\n", " DiabetesPedigreeFunction Age Diabetes \n", "0 627.0 50 1 \n", "1 351.0 31 0 \n", "2 672.0 32 1 \n", "3 167.0 21 0 \n", "4 2288.0 33 1 \n" ] } ], "source": [ "# Diabetes\n", "#\n", "import pandas as pd\n", "import os\n", "separador=os.sep\n", "\n", "# Path del archivo\n", "carpeta=str(os.path.join(os.getcwd(),\"datasets\"))\n", "archivo=str(os.path.join(carpeta,\"diabetes.csv\"))\n", "\n", "# Convertir archivo a DataFrame de Pandas\n", "df_diabetes = pd.read_csv(archivo)\n", "\n", "# Tamaño del datast\n", "print(\"\\nTamaño: \",df_diabetes.shape)\n", "\n", "# Columnas\n", "columnas_diabetes=df_diabetes.columns.tolist()\n", "print(\"\\nColumnas=\",columnas_diabetes)\n", "\n", "# Tipos de variables\n", "print(\"\\nTipos de variable:\")\n", "print(df_diabetes.info())\n", "\n", "# Estadisticos:\n", "print(\"\\nEstadísticos:\")\n", "print(df_diabetes.describe())\n", "print(df_diabetes.head())\n" ] }, { "cell_type": "code", "execution_count": 50, "id": "bc1641a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Tamaño: (891, 12)\n", "\n", "Columnas= ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']\n", "\n", "Tipos de variable:\n", "\n", "RangeIndex: 891 entries, 0 to 890\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 PassengerId 891 non-null int64 \n", " 1 Survived 891 non-null int64 \n", " 2 Pclass 891 non-null int64 \n", " 3 Name 891 non-null object \n", " 4 Sex 891 non-null object \n", " 5 Age 714 non-null float64\n", " 6 SibSp 891 non-null int64 \n", " 7 Parch 891 non-null int64 \n", " 8 Ticket 891 non-null object \n", " 9 Fare 891 non-null float64\n", " 10 Cabin 204 non-null object \n", " 11 Embarked 889 non-null object \n", "dtypes: float64(2), int64(5), object(5)\n", "memory usage: 83.7+ KB\n", "None\n", "\n", "Estadísticos:\n", " PassengerId Survived Pclass Age SibSp \\\n", "count 891.000000 891.000000 891.000000 714.000000 891.000000 \n", "mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n", "std 257.353842 0.486592 0.836071 14.526497 1.102743 \n", "min 1.000000 0.000000 1.000000 0.420000 0.000000 \n", "25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n", "50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n", "75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n", "max 891.000000 1.000000 3.000000 80.000000 8.000000 \n", "\n", " Parch Fare \n", "count 891.000000 891.000000 \n", "mean 0.381594 32.204208 \n", "std 0.806057 49.693429 \n", "min 0.000000 0.000000 \n", "25% 0.000000 7.910400 \n", "50% 0.000000 14.454200 \n", "75% 0.000000 31.000000 \n", "max 6.000000 512.329200 \n" ] } ], "source": [ "# Titanic\n", "#\n", "# URL del dataset en GitHub\n", "url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'\n", "\n", "# Cargar el dataset desde la URL\n", "df_titanic = pd.read_csv(url)\n", "\n", "# Tamaño del datast\n", "print(\"\\nTamaño: \",df_titanic.shape)\n", "\n", "# Columnas\n", "columnas_titanic=df_titanic.columns.tolist()\n", "print(\"\\nColumnas=\",columnas_titanic)\n", "\n", "# Tipos de variables\n", "print(\"\\nTipos de variable:\")\n", "print(df_titanic.info())\n", "\n", "# Estadisticos:\n", "print(\"\\nEstadísticos:\")\n", "print(df_titanic.describe())\n", "\n", "# Guardar CSV\n", "nombre_archivo=\"datasets\"+str(separador)+\"titanic.csv\"\n", "df_titanic.to_csv(nombre_archivo)" ] }, { "cell_type": "markdown", "id": "5ac831e7", "metadata": {}, "source": [ "### 2. Inconsistencias y anomalias" ] }, { "cell_type": "code", "execution_count": 51, "id": "fea3f609", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Valores duplicados: 0\n", "\n", "Valores faltantes por columna:\n", "Pregnancies 0\n", "Glucose 0\n", "BloodPressure 0\n", "SkinThickness 0\n", "Insulin 0\n", "BMI 0\n", "DiabetesPedigreeFunction 0\n", "Age 0\n", "Diabetes 0\n", "dtype: int64\n", "\n", "Cantidad de Ceros:\n", "Glucose: 5\n", "BloodPressure: 35\n", "SkinThickness: 227\n", "Insulin: 374\n", "BMI: 11\n", "DiabetesPedigreeFunction: 0\n", "Age: 0\n", "Diabetes: 500\n" ] } ], "source": [ "# Dataset de Diabetes\n", "\n", "# Identificar valores duplicados \n", "duplicados = df_diabetes.duplicated().sum() \n", "print(f\"\\nValores duplicados: {duplicados}\") \n", "\n", "# Identificar valores faltantes \n", "valores_faltantes = df_diabetes.isnull().sum() \n", "print(\"\\nValores faltantes por columna:\") \n", "print(valores_faltantes) \n", "\n", "# Mostrar Variables con ceros \n", "print(\"\\nCantidad de Ceros:\")\n", "for columna in columnas_diabetes[1:]:\n", " num_zeros = (df_diabetes[columna] == 0).sum()\n", " print(f\"{columna}: {num_zeros}\")" ] }, { "cell_type": "code", "execution_count": 52, "id": "e9f7a90b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Valores duplicados: 0\n", "\n", "Valores faltantes por columna:\n", "PassengerId 0\n", "Survived 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 177\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 687\n", "Embarked 2\n", "dtype: int64\n", "\n", "Cantidad de Ceros:\n", "PassengerId: 0\n", "Survived: 549\n", "Pclass: 0\n", "Name: 0\n", "Sex: 0\n", "Age: 0\n", "SibSp: 608\n", "Parch: 678\n", "Ticket: 0\n", "Fare: 15\n", "Cabin: 0\n", "Embarked: 0\n" ] } ], "source": [ "# Dataset de Titanic\n", "\n", "# Identificar valores duplicados \n", "duplicados = df_titanic.duplicated().sum() \n", "print(f\"\\nValores duplicados: {duplicados}\") \n", "\n", "# Identificar valores faltantes \n", "valores_faltantes = df_titanic.isnull().sum() \n", "print(\"\\nValores faltantes por columna:\") \n", "print(valores_faltantes) \n", "\n", "# Mostrar Variables con ceros \n", "print(\"\\nCantidad de Ceros:\")\n", "for columna in columnas_titanic[:]:\n", " num_zeros = (df_titanic[columna] == 0).sum()\n", " print(f\"{columna}: {num_zeros}\")" ] }, { "cell_type": "markdown", "id": "cf49145e", "metadata": {}, "source": [ "## 3. Manejo de Anomalías y datos faltantes\n", "### Diabetes:" ] }, { "cell_type": "code", "execution_count": 53, "id": "f56d8b8f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Promedios de columnas excluyendo ceros (reemplazarán a los datos faltantes o nulos)\n", " Variable Promedio\n", "0 Glucose 121.686763\n", "1 BloodPressure 72.405184\n", "2 SkinThickness 29.153420\n", "3 Insulin 155.548223\n", "4 BMI 32.457464\n", "5 DiabetesPedigreeFunction 428.235091\n", "6 Age 33.240885\n", "\n", "Reemplazo de los datos faltantes o nulos)\n", " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148.0 72.0 35.00000 155.548223 33.6 \n", "1 1 85.0 66.0 29.00000 155.548223 26.6 \n", "2 8 183.0 64.0 29.15342 155.548223 23.3 \n", "3 1 89.0 66.0 23.00000 94.000000 28.1 \n", "4 0 137.0 40.0 35.00000 168.000000 43.1 \n", "\n", " DiabetesPedigreeFunction Age Diabetes \n", "0 627.0 50 1 \n", "1 351.0 31 0 \n", "2 672.0 32 1 \n", "3 167.0 21 0 \n", "4 2288.0 33 1 \n" ] } ], "source": [ "medias=[]\n", "variables=[]\n", "print(\"Promedios de columnas excluyendo ceros (reemplazarán a los datos faltantes o nulos)\")\n", "for i in range (1,len(columnas_diabetes)-1):\n", " variable=columnas_diabetes[i]\n", " mean_excluding_zeros = df_diabetes[df_diabetes[variable] != 0][variable].mean()\n", " variables.append(variable)\n", " medias.append(mean_excluding_zeros)\n", "dfMedias=pd.DataFrame()\n", "dfMedias[\"Variable\"]=variables\n", "dfMedias[\"Promedio\"]=medias\n", "print(dfMedias)\n", "\n", "print(\"\\nReemplazo de los datos faltantes o nulos)\")\n", "# Reemplazar los ceros por el valor medio calculado\n", "\n", "columns_to_impute = df_diabetes.columns.difference(['Pregnancies', 'Diabetes'])\n", "# Calcular los promedios excluyendo los ceros para cada columna relevante\n", "mean_values = {}\n", "for column in columns_to_impute:\n", " mean_values[column] = df_diabetes[df_diabetes[column] != 0][column].mean()\n", "\n", "# Reemplazar los ceros por los promedios calculados\n", "for column in columns_to_impute:\n", " df_diabetes[column] = df_diabetes[column].replace(0, mean_values[column])\n", "\n", "# Mostrar los resultados\n", "print(df_diabetes.head()) " ] }, { "cell_type": "code", "execution_count": 54, "id": "73af76e9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeDiabetes
06148.072.035.00000155.54822333.6627.00501
1185.066.029.00000155.54822326.6351.00310
28183.064.029.15342155.54822323.3672.00321
3189.066.023.0000094.00000028.1167.00210
40137.040.035.00000168.00000043.12288.00331
..............................
76310101.076.048.00000180.00000032.9171.00630
7642122.070.027.00000155.54822336.80.34270
7655121.072.023.00000112.00000026.2245.00300
7661126.060.029.15342155.54822330.1349.00471
767193.070.031.00000155.54822330.4315.00230
\n", "

768 rows × 9 columns

\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148.0 72.0 35.00000 155.548223 33.6 \n", "1 1 85.0 66.0 29.00000 155.548223 26.6 \n", "2 8 183.0 64.0 29.15342 155.548223 23.3 \n", "3 1 89.0 66.0 23.00000 94.000000 28.1 \n", "4 0 137.0 40.0 35.00000 168.000000 43.1 \n", ".. ... ... ... ... ... ... \n", "763 10 101.0 76.0 48.00000 180.000000 32.9 \n", "764 2 122.0 70.0 27.00000 155.548223 36.8 \n", "765 5 121.0 72.0 23.00000 112.000000 26.2 \n", "766 1 126.0 60.0 29.15342 155.548223 30.1 \n", "767 1 93.0 70.0 31.00000 155.548223 30.4 \n", "\n", " DiabetesPedigreeFunction Age Diabetes \n", "0 627.00 50 1 \n", "1 351.00 31 0 \n", "2 672.00 32 1 \n", "3 167.00 21 0 \n", "4 2288.00 33 1 \n", ".. ... ... ... \n", "763 171.00 63 0 \n", "764 0.34 27 0 \n", "765 245.00 30 0 \n", "766 349.00 47 1 \n", "767 315.00 23 0 \n", "\n", "[768 rows x 9 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_diabetes" ] }, { "cell_type": "markdown", "id": "45ee240a", "metadata": {}, "source": [ "### Titanic" ] }, { "cell_type": "code", "execution_count": 70, "id": "b0d89593", "metadata": {}, "outputs": [], "source": [ "# Imputar datos faltantes \n", "# A faltantes de edad se imputa la mediana\n", "df_titanic['Age'].fillna(df_titanic['Age'].median(), inplace=True)\n", "\n", "# A faltantes de cabina, se le imputa “NC”\n", "df_titanic['Cabin'].fillna('NC', inplace=True)" ] }, { "cell_type": "code", "execution_count": 71, "id": "1e076b52", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NCS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NCS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NCS
.......................................
88688702Montvila, Rev. Juozasmale27.00021153613.0000NCS
88788811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88888903Johnston, Miss. Catherine Helen \"Carrie\"female28.012W./C. 660723.4500NCS
88989011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89089103Dooley, Mr. Patrickmale32.0003703767.7500NCQ
\n", "

891 rows × 12 columns

\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", ".. ... ... ... \n", "886 887 0 2 \n", "887 888 1 1 \n", "888 889 0 3 \n", "889 890 1 1 \n", "890 891 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", ".. ... ... ... ... \n", "886 Montvila, Rev. Juozas male 27.0 0 \n", "887 Graham, Miss. Margaret Edith female 19.0 0 \n", "888 Johnston, Miss. Catherine Helen \"Carrie\" female 28.0 1 \n", "889 Behr, Mr. Karl Howell male 26.0 0 \n", "890 Dooley, Mr. Patrick male 32.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NC S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NC S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NC S \n", ".. ... ... ... ... ... \n", "886 0 211536 13.0000 NC S \n", "887 0 112053 30.0000 B42 S \n", "888 2 W./C. 6607 23.4500 NC S \n", "889 0 111369 30.0000 C148 C \n", "890 0 370376 7.7500 NC Q \n", "\n", "[891 rows x 12 columns]" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_titanic" ] }, { "cell_type": "code", "execution_count": null, "id": "8786b126", "metadata": {}, "outputs": [], "source": [ "## Indexación\n", "### Uso de iloc en Titanic" ] }, { "cell_type": "code", "execution_count": 88, "id": "6e8aa3f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Columnas 1 a 4:\n", " Survived Pclass Name\n", "0 0 3 Braund, Mr. Owen Harris\n", "1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n", "2 1 3 Heikkinen, Miss. Laina\n", "3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n", "4 0 3 Allen, Mr. William Henry\n", "\n", "Valor promedio de la columna 5 ( Age ): 29.36158249158249\n" ] } ], "source": [ "# Seleccionar las columnas de la segunda a la cuarta \n", "sub_df=df_titanic.iloc[0:5, 1:4]\n", "print(\"\\nColumnas 1 a 4:\\n\", sub_df)\n", "\n", "# Valor promedio de la columna 5\n", "valor=df_titanic.iloc[:,5].mean()\n", "print(\"\\nValor promedio de la columna 5 (\", str(df_titanic.columns.to_list()[5]), \"):\", str(valor))" ] }, { "cell_type": "markdown", "id": "7bc7ae6c", "metadata": {}, "source": [ "### Ejemplo de loc en Diabetes" ] }, { "cell_type": "code", "execution_count": 104, "id": "7c538a9c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Age Glucose Diabetes\n", "0 50 148.0 1\n", "1 31 85.0 0\n", "2 32 183.0 1\n", "3 21 89.0 0\n", "4 33 137.0 1\n", "\n", " Age Glucose Diabetes\n", "8 53 197.0 1\n", "22 41 196.0 1\n", "206 57 196.0 1\n", "228 31 197.0 0\n", "359 29 196.0 1\n", "408 39 197.0 1\n", "561 28 198.0 1\n", "579 62 197.0 1\n", "661 22 199.0 1\n" ] } ], "source": [ "# Seleccionar las primeras 5 filas y las columnas 'Age', 'Glucose' y 'Diabetes'\n", "selected_data = df_diabetes.loc[:4, ['Age', 'Glucose', 'Diabetes']]\n", "print(selected_data)\n", "\n", "print(\"\")\n", "# Seleccionar todas las donde glucose > 195 y las columnas 'Age', 'Glucose' y 'Diabetes'\n", "filtrado_Glucose = df_diabetes.loc[df_diabetes['Glucose'] > 195, ['Age', 'Glucose', 'Diabetes']]\n", "print(filtrado_Glucose)" ] }, { "cell_type": "markdown", "id": "1615a5d3", "metadata": {}, "source": [ "## Agrupacion y Agregacion\n", "### Agrupación sobre Diabetes" ] }, { "cell_type": "code", "execution_count": 105, "id": "4d5549c1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Age Glucose\n", "Diabetes \n", "0 21 197.0\n", "1 21 199.0\n" ] } ], "source": [ "# Agrupar por Diagnóstico de diabetes y calcular la menor edad y mayor glucosa\n", "agrupado_diabetes = df_diabetes.groupby('Diabetes').agg({'Age': 'min', 'Glucose': 'max'})\n", "print(agrupado_diabetes)" ] }, { "cell_type": "markdown", "id": "42ff7400", "metadata": {}, "source": [ "## Uniones y Fusiones\n", "### Union (join) sobre Diabetes" ] }, { "cell_type": "code", "execution_count": 111, "id": "8db12f53", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Datos médicos:\n", " Glucose BloodPressure SkinThickness Insulin BMI Age\n", "0 148.0 72.0 35.00000 155.548223 33.6 50\n", "1 85.0 66.0 29.00000 155.548223 26.6 31\n", "2 183.0 64.0 29.15342 155.548223 23.3 32\n", "3 89.0 66.0 23.00000 94.000000 28.1 21\n", "4 137.0 40.0 35.00000 168.000000 43.1 33\n", ".. ... ... ... ... ... ...\n", "763 101.0 76.0 48.00000 180.000000 32.9 63\n", "764 122.0 70.0 27.00000 155.548223 36.8 27\n", "765 121.0 72.0 23.00000 112.000000 26.2 30\n", "766 126.0 60.0 29.15342 155.548223 30.1 47\n", "767 93.0 70.0 31.00000 155.548223 30.4 23\n", "\n", "[768 rows x 6 columns]\n", "\n", "Datos demográficos:\n", " Age Diabetes\n", "0 50 1\n", "1 31 0\n", "2 32 1\n", "3 21 0\n", "4 33 1\n", ".. ... ...\n", "763 63 0\n", "764 27 0\n", "765 30 0\n", "766 47 1\n", "767 23 0\n", "\n", "[768 rows x 2 columns]\n", "\n", "\n", "Union de Dataframes\n", " Glucose BloodPressure SkinThickness Insulin BMI Age Diabetes\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 1\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 0\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 0\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 0\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 1\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 1\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 1\n", "0 148.0 72.0 35.00000 155.548223 33.6 50 1\n", "21 99.0 84.0 29.15342 155.548223 35.4 50 1\n", "21 99.0 84.0 29.15342 155.548223 35.4 50 0\n" ] } ], "source": [ "# Crear DataFrames adicionales para simular la unión \n", "df_medidas = df_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']] \n", "print(\"Datos médicos:\\n\",df_medidas)\n", "df_demografico = df_diabetes[['Age', 'Diabetes']] \n", "print(\"\\nDatos demográficos:\\n\",df_demografico)\n", "print(\"\")\n", "# Realizar la unión \n", "unido_diabetes = df_medidas.join(df_demografico.set_index('Age'), on='Age', how='inner', lsuffix='_med', rsuffix='_demo') \n", "print(\"\\nUnion de Dataframes\\n\",unido_diabetes.head(10))" ] }, { "cell_type": "markdown", "id": "12e76f40", "metadata": {}, "source": [ "### Fusion (Merge) sobre Titanic" ] }, { "cell_type": "code", "execution_count": 121, "id": "10e476fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Pclass Fare Average_Fare\n", "0 3 7.2500 13.675550\n", "1 1 71.2833 84.154687\n", "2 3 7.9250 13.675550\n", "3 1 53.1000 84.154687\n", "4 3 8.0500 13.675550\n", "5 3 8.4583 13.675550\n", "6 1 51.8625 84.154687\n", "7 3 21.0750 13.675550\n", "8 3 11.1333 13.675550\n", "9 2 30.0708 20.662183\n" ] } ], "source": [ "# Crear un DataFrame adicional para simular la fusión\n", "data_adicional = {\n", " 'Pclass': [1, 2, 3],\n", " 'Average_Fare': [84.154687, 20.662183, 13.675550]\n", "}\n", "df_adicional = pd.DataFrame(data_adicional)\n", "\n", "# Realizar la fusión\n", "fusionado_titanic = pd.merge(df_titanic, df_adicional, on='Pclass', how='left')\n", "print(fusionado_titanic[['Pclass', 'Fare', 'Average_Fare']].head(10))" ] }, { "cell_type": "markdown", "id": "c4474f90", "metadata": {}, "source": [ "## Discretizacion" ] }, { "cell_type": "markdown", "id": "9471f002", "metadata": {}, "source": [ "### Titanic" ] }, { "cell_type": "code", "execution_count": 69, "id": "7c889e36", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Valores posibles por variable categorica:\n", " Variable valores posibles\n", "0 Name 891\n", "1 Sex 2\n", "2 Cabin 148\n", "3 Ticket 681\n", "4 Embarked 4\n" ] } ], "source": [ "# Tratamiento de las variables categoricas:\n", "variables_categoricas=['Name','Sex','Cabin','Ticket','Embarked']\n", "\n", "# Cantidad de valores posibles (no numéricos) por variable\n", "df_cat=pd.DataFrame()\n", "val_posibles=[]\n", "for i in range(len(variables_categoricas)):\n", " variable=variables_categoricas[i]\n", " valor=len(list(set(list(df_titanic[variable]))))\n", " val_posibles.append(valor)\n", "df_cat[\"Variable\"]=variables_categoricas\n", "df_cat[\"valores posibles\"]=val_posibles\n", "print(\"Valores posibles por variable categorica:\")\n", "print(df_cat) " ] }, { "cell_type": "markdown", "id": "216da1ce", "metadata": {}, "source": [ "Con esta información definimos: eliminación de columnas Name, Cabin y Ticket, y haremos la discretización de Sex y Embarked" ] }, { "cell_type": "code", "execution_count": 127, "id": "878f6662", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedAgeSibSpParchFarePclass_1Pclass_2Pclass_3Sex_femaleSex_maleEmbarked_CEmbarked_QEmbarked_S
0022.0107.250000101001
1138.01071.283310010100
2126.0007.925000110001
3135.01053.100010010001
4035.0008.050000101001
..........................................
886027.00013.000001001001
887119.00030.000010010001
888028.01223.450000110001
889126.00030.000010001100
890032.0007.750000101010
\n", "

891 rows × 13 columns

\n", "
" ], "text/plain": [ " Survived Age SibSp Parch Fare Pclass_1 Pclass_2 Pclass_3 \\\n", "0 0 22.0 1 0 7.2500 0 0 1 \n", "1 1 38.0 1 0 71.2833 1 0 0 \n", "2 1 26.0 0 0 7.9250 0 0 1 \n", "3 1 35.0 1 0 53.1000 1 0 0 \n", "4 0 35.0 0 0 8.0500 0 0 1 \n", ".. ... ... ... ... ... ... ... ... \n", "886 0 27.0 0 0 13.0000 0 1 0 \n", "887 1 19.0 0 0 30.0000 1 0 0 \n", "888 0 28.0 1 2 23.4500 0 0 1 \n", "889 1 26.0 0 0 30.0000 1 0 0 \n", "890 0 32.0 0 0 7.7500 0 0 1 \n", "\n", " Sex_female Sex_male Embarked_C Embarked_Q Embarked_S \n", "0 0 1 0 0 1 \n", "1 1 0 1 0 0 \n", "2 1 0 0 0 1 \n", "3 1 0 0 0 1 \n", "4 0 1 0 0 1 \n", ".. ... ... ... ... ... \n", "886 0 1 0 0 1 \n", "887 1 0 0 0 1 \n", "888 1 0 0 0 1 \n", "889 0 1 1 0 0 \n", "890 0 1 0 1 0 \n", "\n", "[891 rows x 13 columns]" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Eliminacion de columnas\n", "df_titanic_cleaned = df_titanic.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'])\n", "\n", "# Discretizacion de variables:\n", "df_titanic_encoded=pd.get_dummies(df_titanic_cleaned,columns=['Pclass','Sex','Embarked'])\n", "df_titanic_encoded" ] }, { "cell_type": "markdown", "id": "b9de15e2", "metadata": {}, "source": [ "## Normalización\n", "### Diabetes" ] }, { "cell_type": "code", "execution_count": 132, "id": "b16bcf3c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Primeros registros del dataset normalizado:\n", " Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n", "0 0.639947 0.865108 -0.033518 6.655021e-01 -3.345079e-16 \n", "1 -0.844885 -1.206162 -0.529859 -1.746338e-02 -3.345079e-16 \n", "2 1.233880 2.015813 -0.695306 8.087936e-16 -3.345079e-16 \n", "3 -0.844885 -1.074652 -0.529859 -7.004289e-01 -7.243887e-01 \n", "4 -1.141852 0.503458 -2.680669 6.655021e-01 1.465506e-01 \n", "\n", " BMI DiabetesPedigreeFunction Age Diabetes \n", "0 0.166292 0.584149 1.425995 1 \n", "1 -0.852531 -0.226986 -0.190672 0 \n", "2 -1.332833 0.716400 -0.105584 1 \n", "3 -0.634212 -0.767743 -1.041549 0 \n", "4 1.548980 5.465654 -0.020496 1 \n" ] } ], "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", "# Crear el escalador estándar\n", "scaler = StandardScaler()\n", "\n", "target = df_diabetes['Diabetes']\n", "\n", "# Ajustar el escalador y transformar los datos\n", "data_normalized = scaler.fit_transform(df_diabetes)\n", "\n", "# Convertir el resultado a un DataFrame y agregar de nuevo la columna 'Outcome'\n", "data_normalized = pd.DataFrame(data_normalized, columns=df_diabetes.columns)\n", "data_normalized['Diabetes'] = target\n", "\n", "# Mostrar los primeros registros del dataset normalizado\n", "print(\"Primeros registros del dataset normalizado:\")\n", "print(data_normalized.head())" ] }, { "cell_type": "code", "execution_count": null, "id": "5d89bacb", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }