#30diasdegraficos Parte 3 – Data, Code and Visualization

Introducción

Seguiremos utilizando los mismos paquetes que la parte anterior.

Code

# ejecutar estas líneas para poder instalar {datos}
# install.packages("remotes")
# remotes::install_github("cienciadedatos/datos")

library(datos)       # datos
library(highcharter) # gráficos
library(ggplot2)     # más gráficos  
library(readr)       # lectura de datos
library(dplyr)       # manipulación de datos
library(tidyr)       # más manipulación de datos

Cambiando configuración para el español.

Code

newlang_opts <- getOption("highcharter.lang")

f <- Sys.Date()
dias <- weekdays((f - lubridate::days(lubridate::wday(f) - 1)) + lubridate::days(0:6))

newlang_opts$weekdays <- dias
newlang_opts$months <- as.character(lubridate::month(1:12, label = TRUE, abbr = FALSE))
newlang_opts$shortMonths <- as.character(lubridate::month(1:12, label = TRUE, abbr = TRUE))
newlang_opts$thousandsSep <- ","

options(highcharter.lang = newlang_opts)

Día 21: gráficos con anotaciones

Realizaremos el ejemplo canónico en HighcharsJS

Code

tourfrance <- read_csv(here::here("posts/2020-06-02-30diasdegraficos-parte-3/data/tour_france_state_8.txt"))

hc21 <- hchart(tourfrance, "area", hcaes(distance, elevation), fillOpacity = 0.25) %>% 
  hc_title(text = "Tour de Francia 2017, Etapa 8: <i>Dole - Station des Rousses</i>") %>% 
  hc_subtitle(text = "Ejemplo obtendido de la documentación de HighchartsJS") %>% 
  hc_xAxis(labels = list(format = "{value} km"), title  = list(text = "Distancia")) %>% 
  hc_yAxis(labels = list(format = "{value} m"), title = list(text = "Elevación")) %>% 
  hc_tooltip(
    headerFormat = "Distance: {point.x:.1f} km<br>",
    pointFormat = "{point.y} m a. s. l."
    )
   
hc21

Code

df1 <- read_csv('"x","y", "text"
27.98,255, "Arbois"
45.5,611,"Montrond"
63,651,"Mont-sur-Monnet"
84,789,"Bonlieu"
129.5,382,"Chassal"
159,443,"Saint-Claude"')

df2 <- read_csv('"x","y","text"
101.44,1026,"Col de la Joux"
138.5,748,"Côte de Viry"
176.4,1202,"Montée de la Combe<br>de Laisia Les Molunes"')

df3 <- read_csv('"x","y","text"
96.2,783,"6.1 km climb<br>4.6% on avg."
134.5,540,"7.6 km climb<br>5.2% on avg."
172.2,925,"11.7 km climb<br>6.4% on avg."
')

df_to_annotations_labels <- function(df, xAxis = 0, yAxis = 0) {
  
  stopifnot(hasName(df, "x"))
  stopifnot(hasName(df, "y"))
  stopifnot(hasName(df, "text"))
  
  df %>% 
    rowwise() %>% 
    mutate(point = list(list(x = x, y = y, xAxis = 0, yAxis = 0))) %>% 
    select(-x, -y)  
  
}

df1_p <- df_to_annotations_labels(df1)

df2_p <- df_to_annotations_labels(df2)

df3_p <- df_to_annotations_labels(df3)
  
hc21 %>% 
  hc_annotations(
    list(
      labelOptions = list(backgroundColor = 'rgba(255,255,255,0.5)', verticalAlign = "top", y = 15),
      labels = list_parse(df1_p)
      ),
    list(
      labels = list_parse(df2_p)
      ),
    list(
      labelOptions = list(
        shape = "connector",
        align = "right",
        justify = FALSE,
        crop = TRUE,
        style = list(fontSize = "0.8em", textOutline = "1px white")
        ),
      labels = list_parse(df3_p)
      )
    ) %>% 
  hc_caption(
    text = "Este gráfico utiliza la función Anotaciones de Highcharts para colocar
    etiquetas en varios puntos de interés. Las etiquetas son <i>responsivas</i> y se ocultarán
    para evitar la superposición en pantallas pequeñas."
  )

Día 22: datos textuales

Code

library(stringr)
library(rvest)

read_html("https://github.com/cienciadedatos/r4ds") %>%
  html_nodes("a.js-navigation-open") %>%
  html_text() %>%
  str_subset("[0-9]{2}.*.qmd") %>%
  dput()

character(0)

Code

secciones <- c(
  "01-intro.qmd", "02-explore.qmd", "03-visualize.qmd", "04-workflow-basics.qmd", 
  "05-transform.qmd", "06-workflow-scripts.qmd", "07-eda.qmd", 
  "08-workflow-projects.qmd", "09-wrangle.qmd", "10-tibble.qmd", 
  "11-import.qmd", "12-tidy.qmd", "13-relational-data.qmd", "14-strings.qmd", 
  "15-factors.qmd", "16-datetimes.qmd", "17-program.qmd", "18-pipes.qmd", 
  "19-functions.qmd", "20-vectors.qmd", "21-iteration.qmd", "22-model.qmd", 
  "23-model-basics.qmd", "24-model-building.qmd", "25-model-many.qmd", 
  "26-communicate.qmd", "27-rmarkdown.qmd", "28-communicate-plots.qmd", 
  "29-rmarkdown-formats.qmd", "30-rmarkdown-workflow.qmd")

ruta_base <- "https://raw.githubusercontent.com/cienciadedatos/r4ds/traduccion/"

r4ds <- purrr::map_df(secciones, function(seccion = "23-model-basics.qmd"){
  
  message(paste0(ruta_base, seccion))
  
  lineas <- read_lines(paste0(ruta_base, seccion))
  
  data_frame(
    seccion = seccion,
    texto  = lineas
  )
  
})

r4ds <- r4ds %>% 
  mutate(
    seccion_num = as.numeric(str_extract(seccion, "[0-9]{2}")),
    capitulo = case_when(
      seccion_num <=  1 ~ "1. Bienvenida",
      seccion_num <=  8 ~ "2. Explorar",
      seccion_num <= 16 ~ "3. Manejar datos",
      seccion_num <= 21 ~ "4. Programar",
      seccion_num <= 25 ~ "5. Modelar",
      seccion_num <= 30 ~ "6. Comunicar",
    )
  )

Code

library(tidytext)

r4ds2 <- r4ds %>%
  unnest_tokens(palabra, texto) %>% 
  mutate(
    palabra = str_to_lower(palabra),
    palabra = str_remove_all(palabra, "_"),
    palabra = str_remove_all(palabra, "[0-9]+"),
    palabra = str_remove_all(palabra, "[:punct:]+"),
    palabra = str_trim(palabra)
    ) %>% 
  filter(palabra != "") %>% 
  anti_join(tibble(palabra = stopwords::stopwords(language = "es")), by = "palabra")

r4ds2 <- r4ds2 %>%
  count(capitulo, palabra, sort = TRUE)

total_r4ds2 <- r4ds2 %>% 
  group_by(capitulo) %>% 
  summarize(total = sum(n))

r4ds2 <- left_join(r4ds2, total_r4ds2)

r4ds2 <- r4ds2 %>%
  bind_tf_idf(palabra, capitulo, n)

r4ds2

# A tibble: 14,467 × 7
   capitulo         palabra     n total     tf   idf tf_idf
   <chr>            <chr>   <int> <int>  <dbl> <dbl>  <dbl>
 1 3. Manejar datos r         519 15182 0.0342     0      0
 2 4. Programar     x         323 10554 0.0306     0      0
 3 2. Explorar      r         255 12504 0.0204     0      0
 4 4. Programar     r         254 10554 0.0241     0      0
 5 3. Manejar datos datos     210 15182 0.0138     0      0
 6 2. Explorar      x         203 12504 0.0162     0      0
 7 3. Manejar datos x         196 15182 0.0129     0      0
 8 3. Manejar datos n         187 15182 0.0123     0      0
 9 5. Modelar       x         183  7919 0.0231     0      0
10 6. Comunicar     r         180  7113 0.0253     0      0
# ℹ 14,457 more rows

Code

r4ds2_top <- r4ds2 %>%
  arrange(desc(tf_idf)) %>%
  mutate(palabra = factor(palabra, levels = rev(unique(palabra)))) %>% 
  group_by(capitulo) %>% 
  top_n(10) %>% 
  ungroup()
  
ggplot(r4ds2_top, aes(palabra, tf_idf, fill = capitulo)) +
  geom_col(show.legend = FALSE, width = 0.5) +
  labs(x = NULL, y = "tf-idf") +
  scale_fill_viridis_d(option = "B", begin = 0.1, end = 0.9) +
  facet_wrap(vars(capitulo), ncol = 3, scales = "free") +
  coord_flip()

Code

r4ds2_top50 <- r4ds2 %>%
  arrange(desc(tf_idf)) %>%
  mutate(palabra = factor(palabra, levels = rev(unique(palabra)))) %>% 
  group_by(capitulo) %>% 
  top_n(50) %>% 
  ungroup()

r4ds2_top50 <- r4ds2_top50 %>% 
  arrange(desc(tf_idf)) %>% 
  add_row(capitulo = "", palabra = "R4DS", tf_idf = max(r4ds2_top50$tf_idf)* 2)

fntfmly <- '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"' 

hchart(
  r4ds2_top50, 
  "wordcloud",
  hcaes(name = palabra, weight = tf_idf, color = capitulo),
  style  = list(fontFamily = fntfmly, fontWeight = "bold"),
  tooltip = list(
    pointHeader = "<b>{point.key}</b>",
    pointFormat = "Capítulo <b>{point.capitulo}</b><br>TF-IDF: {point.tf_idf:0.4f}"
    )
  )

Día 23: proyección solar - sunburst

Code

# install.packages("treemap")
library(treemap)

paises_2007 <- datos::paises %>% 
  filter(anio == max(anio)) %>% 
  mutate(poblacion = round(poblacion/1e6))

data_to_hierarchical_series <- function(data, group_vars, size_var) {
  
  # data <- paises_2007
  # group_vars <- c("continente", "pais")
  # size_var <- "poblacion"
  
  group_syms <- rlang::syms(group_vars)
  size_sym <- rlang::sym(size_var)
  
  if (data %>%
      select(!!!group_syms) %>%
      mutate_all(as.character) %>%
      purrr::map(unique) %>%
      unlist() %>%
      anyDuplicated()) stop("Sunburst data uses same label at multiple levels.")
  
  data <- data %>%
    mutate_at(vars(all_of(group_vars)), as.character)
  
  name_cell <- function(..., depth) paste0(list(...), 1:depth, collapse = "")
  
  data_at_depth <- function(depth = 1) {
    
    data %>%
      group_by(!!!group_syms[1:depth]) %>%
      summarise(value = sum(!!size_sym)) %>%
      ungroup() %>%
      arrange(desc(value)) %>% 
      mutate(name = !!group_syms[[depth]],
             level = depth) %>%
      # mutate_at(group_vars, as.character()) %>%
      {
        if (depth == 1)
          mutate(., id = paste0(name, 1))
        else {
          mutate(
            .,
            parent = purrr::pmap_chr(list(!!!group_syms[1:depth - 1]),
                                     name_cell,
                                     depth = depth - 1),
            id = paste0(parent, name, depth)
          )
        }
      }
  }
  
  sunburst_df <- 1:length(group_vars) %>%
    purrr::map(data_at_depth) %>%
    bind_rows() %>%
    arrange(level)
  
  data_list <- sunburst_df %>%
    highcharter::list_parse() # %>% purrr::map( ~ .[!is.na(.)])
  
  data_list
  
}


dataserie <- data_to_hierarchical_series(
  paises_2007,
  group_vars = c("continente", "pais"),
  size_var = "poblacion"
  )

highchart() %>%
  hc_add_series(
    data = dataserie,
    type = "sunburst",
    # type=  "treemap",
    allowDrillToNode = TRUE,
    levels = list(
      list(
        level = 1,
        borderWidth = 0,
        borderColor = "transparent",
        colorByPoint = TRUE,
        dataLabels = list(enabled = TRUE)
        ),
      list(
        level = 2,
        borderWidth = 0,
        borderColor = "transparent",
        colorVariation = list(key = "brightness", to = 0.50),
        dataLabels = list(enabled = TRUE)
        )
      )
    )

Code

paises_2007 <- paises_2007 %>% 
  mutate(mundo = "Mundo")

dataserie <- data_to_hierarchical_series(
  paises_2007,
  group_vars = c("mundo", "continente", "pais"),
  size_var = "poblacion"
  )

highchart() %>%
  hc_add_series(
    data = dataserie,
    type = "sunburst",
    name = "Población",
    # type=  "treemap",
    allowDrillToNode = TRUE,
    tooltip = list(
      headerFormat = "<b>{point.key}</b><br>",
      pointFormat = "{point.value} millones habitantes"
    ),
    levels = list(
       list(
        level = 1,
        borderWidth = 0,
        color = "transparent",
        borderColor = "transparent",
        # colorByPoint = TRUE,
        dataLabels = list(enabled = TRUE)
        ),
      list(
        level = 2,
        borderWidth = 0,
        borderColor = "transparent",
        colorByPoint = TRUE,
        dataLabels = list(enabled = TRUE)
        ),
      list(
        level = 3,
        borderWidth = 0,
        borderColor = "transparent",
        colorVariation = list(key = "brightness", to = 0.50),
        dataLabels = list(enabled = TRUE)
        )
      )
    )

Día 24: coropleta (ya lo hicimos antes?)

Como en el día de datos espaciales realizamos una coropleta, esta realizaremos un diagrama de puntos sobre un mapa!

Code

aeropuertos <- read_csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat", col_names = FALSE)

aeropuertos  <- aeropuertos  %>%
  filter(X4 == "Chile") %>% 
  select(nombre = X2, lon = X8, lat = X7) %>% 
  filter(lat <= 0, lon >= -90)

hc24 <- hcmap("countries/cl/cl-all", showInLegend = FALSE) %>% 
  hc_add_series(
    data = aeropuertos, 
    type = "mappoint",
    name = "Aeropuertos de Chile",
    tooltip = list(pointFormat = "{point.nombre} ({point.lat:0.2f}, {point.lon:0.2f})")
    ) 

hc24

Día 25: violín

Code

library(ggplot2)

p25 <- ggplot(datos::flores, aes(Largo.Sepalo, Especie)) +
  geom_violin()

dflores <- datos::flores %>% 
  distinct(Especie) %>% 
  mutate(y = as.numeric(Especie))

d25 <- as_tibble(layer_data(p25, 1)) %>% 
  select(x, y, violinwidth, width, ndensity) %>% 
  mutate_all(round, 3) %>% 
  mutate(y = as.numeric(y)) %>% 
  left_join(dflores, by = "y")

d25 <- d25 %>% 
  filter(row_number() %% 2 == 0)

hchart(d25, "arearange", hcaes(x, low = y - violinwidth*width  - 1, high = y + violinwidth*width  - 1, group = Especie)) %>% 
  hc_yAxis(
    categories = dflores$Especie, 
    type = "categorical", 
    endOnTick = FALSE,
    startOnTick = FALSE,
    title = list(text = "Especie")
    ) %>% 
  hc_xAxis(
    title = list(text = "Largo del Sépalo")
    ) %>% 
  hc_tooltip(
    useHTML = TRUE,
    pointFormat = "<span style='color:{point.color};'>&#9679;</span> {series.name}: <b>{point.ndensity:,.4f}</b><br/>"
  )

Fin

Lamentablemente el tiempo (quizás la ganas!) dictaron a que no pudiese completar todos los días del challenge. ¿Fail? Para nada! Aprendí mucho de highcharts, volví a escribir en el blog <3 e hice post en español.

Reuse

CC BY 4.0

Citation

BibTeX citation:

@online{kunst_fuentes2020,
  author = {Kunst Fuentes, Joshua},
  title = {\#30diasdegraficos {Parte} 3},
  date = {2020-06-02},
  url = {https://jkunst.com/blog/posts/2020-06-02-30diasdegraficos-parte-3/},
  langid = {en}
}

For attribution, please cite this work as:

Kunst Fuentes, Joshua. 2020. “#30diasdegraficos Parte 3.” June 2, 2020. https://jkunst.com/blog/posts/2020-06-02-30diasdegraficos-parte-3/.