You are on page 1 of 13

Análisis exploratorio

Nombre del estudiante

22 de abril de 2020

library(tidyverse)
library(dplyr)
library(ggplot2)
library(funModeling)
library(corrr)
library(Hmisc)
library(randomForest)

Análisis Exploratorio Inicial


df_status(heart_disease) # Análisis de la estructura del conjunto de datos

## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique


## 1 age 0 0.00 0 0.00 0 0 integer 41
## 2 gender 0 0.00 0 0.00 0 0 factor 2
## 3 chest_pain 0 0.00 0 0.00 0 0 factor 4
## 4 resting_blood_pressure 0 0.00 0 0.00 0 0 integer 50
## 5 serum_cholestoral 0 0.00 0 0.00 0 0 integer 152
## 6 fasting_blood_sugar 258 85.15 0 0.00 0 0 factor 2
## 7 resting_electro 151 49.83 0 0.00 0 0 factor 3
## 8 max_heart_rate 0 0.00 0 0.00 0 0 integer 91
## 9 exer_angina 204 67.33 0 0.00 0 0 integer 2
## 10 oldpeak 99 32.67 0 0.00 0 0 numeric 40
## 11 slope 0 0.00 0 0.00 0 0 integer 3
## 12 num_vessels_flour 176 58.09 4 1.32 0 0 integer 4
## 13 thal 0 0.00 2 0.66 0 0 factor 3
## 14 heart_disease_severity 164 54.13 0 0.00 0 0 integer 5
## 15 exter_angina 204 67.33 0 0.00 0 0 factor 2
## 16 has_heart_disease 0 0.00 0 0.00 0 0 factor 2

Análisis univariado de variables categóricas y numéricas


plot_num(heart_disease) # Análisis para variables numéricas (gráficos)

1
age resting_blood_pressure serum_cholestoral
200
150
100
50
0
30 40 50 60 70 90 120 150 180 210
100 200 300 400 500 600

max_heart_rate exer_angina oldpeak


200
150
count

100
50
0
100 150 200 0.00 0.25 0.50 0.75 1.00 0 2 4 6

slope num_vessels_flour heart_disease_severity


200
150
100
50
0
1.0 1.5 2.0 2.5 3.0 0 1 2 3 0 1 2 3 4
value

Análisis cuantitativo
summary(heart_disease)

## age gender chest_pain resting_blood_pressure


## Min. :29.00 female: 97 1: 23 Min. : 94.0
## 1st Qu.:48.00 male :206 2: 50 1st Qu.:120.0
## Median :56.00 3: 86 Median :130.0
## Mean :54.44 4:144 Mean :131.7
## 3rd Qu.:61.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
##
## serum_cholestoral fasting_blood_sugar resting_electro max_heart_rate
## Min. :126.0 0:258 0:151 Min. : 71.0
## 1st Qu.:211.0 1: 45 1: 4 1st Qu.:133.5
## Median :241.0 2:148 Median :153.0
## Mean :246.7 Mean :149.6
## 3rd Qu.:275.0 3rd Qu.:166.0
## Max. :564.0 Max. :202.0
##
## exer_angina oldpeak slope num_vessels_flour thal
## Min. :0.0000 Min. :0.00 Min. :1.000 Min. :0.0000 3 :166
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000 6 : 18
## Median :0.0000 Median :0.80 Median :2.000 Median :0.0000 7 :117
## Mean :0.3267 Mean :1.04 Mean :1.601 Mean :0.6722 NA's: 2
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000

2
## Max. :1.0000 Max. :6.20 Max. :3.000 Max. :3.0000
## NA's :4
## heart_disease_severity exter_angina has_heart_disease
## Min. :0.0000 0:204 no :164
## 1st Qu.:0.0000 1: 99 yes:139
## Median :0.0000
## Mean :0.9373
## 3rd Qu.:2.0000
## Max. :4.0000
##
describe(heart_disease) #Análisis numérico y categórico (cuantitativo)

## heart_disease
##
## 16 Variables 303 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 303 0 41 0.999 54.44 10.3 40 42
## .25 .50 .75 .90 .95
## 48 56 61 66 68
##
## lowest : 29 34 35 37 38, highest: 70 71 74 76 77
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 303 0 2
##
## Value female male
## Frequency 97 206
## Proportion 0.32 0.68
## --------------------------------------------------------------------------------
## chest_pain
## n missing distinct
## 303 0 4
##
## Value 1 2 3 4
## Frequency 23 50 86 144
## Proportion 0.076 0.165 0.284 0.475
## --------------------------------------------------------------------------------
## resting_blood_pressure
## n missing distinct Info Mean Gmd .05 .10
## 303 0 50 0.995 131.7 19.41 108 110
## .25 .50 .75 .90 .95
## 120 130 140 152 160
##
## lowest : 94 100 101 102 104, highest: 174 178 180 192 200
## --------------------------------------------------------------------------------
## serum_cholestoral
## n missing distinct Info Mean Gmd .05 .10
## 303 0 152 1 246.7 55.91 175.1 188.8
## .25 .50 .75 .90 .95
## 211.0 241.0 275.0 308.8 326.9
##

3
## lowest : 126 131 141 149 157, highest: 394 407 409 417 564
## --------------------------------------------------------------------------------
## fasting_blood_sugar
## n missing distinct
## 303 0 2
##
## Value 0 1
## Frequency 258 45
## Proportion 0.851 0.149
## --------------------------------------------------------------------------------
## resting_electro
## n missing distinct
## 303 0 3
##
## Value 0 1 2
## Frequency 151 4 148
## Proportion 0.498 0.013 0.488
## --------------------------------------------------------------------------------
## max_heart_rate
## n missing distinct Info Mean Gmd .05 .10
## 303 0 91 1 149.6 25.73 108.1 116.0
## .25 .50 .75 .90 .95
## 133.5 153.0 166.0 176.6 181.9
##
## lowest : 71 88 90 95 96, highest: 190 192 194 195 202
## --------------------------------------------------------------------------------
## exer_angina
## n missing distinct Info Sum Mean Gmd
## 303 0 2 0.66 99 0.3267 0.4414
##
## --------------------------------------------------------------------------------
## oldpeak
## n missing distinct Info Mean Gmd .05 .10
## 303 0 40 0.964 1.04 1.225 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.8 1.6 2.8 3.4
##
## lowest : 0.0 0.1 0.2 0.3 0.4, highest: 4.0 4.2 4.4 5.6 6.2
## --------------------------------------------------------------------------------
## slope
## n missing distinct Info Mean Gmd
## 303 0 3 0.798 1.601 0.6291
##
## Value 1 2 3
## Frequency 142 140 21
## Proportion 0.469 0.462 0.069
## --------------------------------------------------------------------------------
## num_vessels_flour
## n missing distinct Info Mean Gmd
## 299 4 4 0.783 0.6722 0.9249
##
## Value 0 1 2 3
## Frequency 176 65 38 20
## Proportion 0.589 0.217 0.127 0.067

4
## --------------------------------------------------------------------------------
## thal
## n missing distinct
## 301 2 3
##
## Value 3 6 7
## Frequency 166 18 117
## Proportion 0.551 0.060 0.389
## --------------------------------------------------------------------------------
## heart_disease_severity
## n missing distinct Info Mean Gmd
## 303 0 5 0.832 0.9373 1.25
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 164 55 36 35 13
## Proportion 0.541 0.182 0.119 0.116 0.043
## --------------------------------------------------------------------------------
## exter_angina
## n missing distinct
## 303 0 2
##
## Value 0 1
## Frequency 204 99
## Proportion 0.673 0.327
## --------------------------------------------------------------------------------
## has_heart_disease
## n missing distinct
## 303 0 2
##
## Value no yes
## Frequency 164 139
## Proportion 0.541 0.459
## --------------------------------------------------------------------------------
profiling_num(heart_disease) # Análisis para variables numéricas (cuantitativas)

## variable mean std_dev variation_coef p_01 p_05


## 1 age 54.4389439 9.0386624 0.1660330 35.00 40.0
## 2 resting_blood_pressure 131.6897690 17.5997477 0.1336455 100.00 108.0
## 3 serum_cholestoral 246.6930693 51.7769175 0.2098840 149.00 175.1
## 4 max_heart_rate 149.6072607 22.8750033 0.1529004 95.02 108.1
## 5 exer_angina 0.3267327 0.4697945 1.4378558 0.00 0.0
## 6 oldpeak 1.0396040 1.1610750 1.1168436 0.00 0.0
## 7 slope 1.6006601 0.6162261 0.3849825 1.00 1.0
## 8 num_vessels_flour 0.6722408 0.9374383 1.3944978 0.00 0.0
## 9 heart_disease_severity 0.9372937 1.2285357 1.3107265 0.00 0.0
## p_25 p_50 p_75 p_95 p_99 skewness kurtosis iqr range_98
## 1 48.0 56.0 61.0 68.0 71.00 -0.2080241 2.465477 13.0 [35, 71]
## 2 120.0 130.0 140.0 160.0 180.00 0.7025346 3.845881 20.0 [100, 180]
## 3 211.0 241.0 275.0 326.9 406.74 1.1298741 7.398208 64.0 [149, 406.74]
## 4 133.5 153.0 166.0 181.9 191.96 -0.5347844 2.927602 32.5 [95.02, 191.96]
## 5 0.0 0.0 1.0 1.0 1.00 0.7388506 1.545900 1.0 [0, 1]
## 6 0.0 0.8 1.6 3.4 4.20 1.2634255 4.530193 1.6 [0, 4.2]

5
## 7 1.0 2.0 2.0 3.0 3.00 0.5057957 2.363050 1.0 [1, 3]
## 8 0.0 0.0 1.0 3.0 3.00 1.1833771 3.234941 1.0 [0, 3]
## 9 0.0 0.0 2.0 3.0 4.00 1.0532483 2.843788 2.0 [0, 4]
## range_80
## 1 [42, 66]
## 2 [110, 152]
## 3 [188.8, 308.8]
## 4 [116, 176.6]
## 5 [0, 1]
## 6 [0, 2.8]
## 7 [1, 2]
## 8 [0, 2]
## 9 [0, 3]

Para variables categóricas


freq(heart_disease) # Análisis categórico (cuantitativo y gráfico)

male 206 (67.99%)


gender

female 97 (32.01%)

Frequency / (Percentage %)

## gender frequency percentage cumulative_perc


## 1 male 206 67.99 67.99
## 2 female 97 32.01 100.00

6
4 144 (47.52%)

3 86 (28.38%)
chest_pain

2 50 (16.5%)

1 23 (7.59%)

Frequency / (Percentage %)

## chest_pain frequency percentage cumulative_perc


## 1 4 144 47.52 47.52
## 2 3 86 28.38 75.90
## 3 2 50 16.50 92.40
## 4 1 23 7.59 100.00

7
0 258 (85.15%)
fasting_blood_sugar

1 45 (14.85%)

Frequency / (Percentage %)

## fasting_blood_sugar frequency percentage cumulative_perc


## 1 0 258 85.15 85.15
## 2 1 45 14.85 100.00

8
0 151 (49.83%)
resting_electro

2 148 (48.84%)

1 4 (1.32%)

Frequency / (Percentage %)

## resting_electro frequency percentage cumulative_perc


## 1 0 151 49.83 49.83
## 2 2 148 48.84 98.67
## 3 1 4 1.32 100.00

9
NA 2 (0.66%)

3 166 (54.79%)
thal

7 117 (38.61%)

6 18 (5.94%)

Frequency / (Percentage %)

## thal frequency percentage cumulative_perc


## 1 3 166 54.79 54.79
## 2 7 117 38.61 93.40
## 3 6 18 5.94 99.34
## 4 <NA> 2 0.66 100.00

10
0 204 (67.33%)
exter_angina

1 99 (32.67%)

Frequency / (Percentage %)

## exter_angina frequency percentage cumulative_perc


## 1 0 204 67.33 67.33
## 2 1 99 32.67 100.00

11
no 164 (54.13%)
has_heart_disease

yes 139 (45.87%)

Frequency / (Percentage %)

## has_heart_disease frequency percentage cumulative_perc


## 1 no 164 54.13 54.13
## 2 yes 139 45.87 100.00
## [1] "Variables processed: gender, chest_pain, fasting_blood_sugar, resting_electro, thal, exter_angina, has_heart_disease"
freq(heart_disease$chest_pain)

12
4 144 (47.52%)

3 86 (28.38%)
var

2 50 (16.5%)

1 23 (7.59%)

Frequency / (Percentage %)

## var frequency percentage cumulative_perc


## 1 4 144 47.52 47.52
## 2 3 86 28.38 75.90
## 3 2 50 16.50 92.40
## 4 1 23 7.59 100.00
tbl=freq(heart_disease$chest_pain,path_out="graficos")
tbl

## var frequency percentage cumulative_perc


## 1 4 144 47.52 47.52
## 2 3 86 28.38 75.90
## 3 2 50 16.50 92.40
## 4 1 23 7.59 100.00

Medidas de Tendencia Central

13

You might also like