library(tidyverse)
library(ggforce)
library(gapminder)
library(ggridges)
library(PASWR)
library(tinter)
library(fmsb)
library(ggalt)
library(cowplot)
library(RColorBrewer)
library(Stat2Data)
library(colorspace)
library(reshape2)
library(devtools)
library(waffle)
library(treemapify)
library(zoo)
library(wordcloud2)
library(tm) #text mining
library(scales)

# Shared styling constants referenced by the plot themes in this script.
text_color <- "#353d42"     # main text colour (titles, axis and legend text)
caption_color <- "#666666"  # colour used for plot captions
font <- "Avenir Next"       # font family handed to element_text() and text layers

1 Visualize amount

1.1 Dot plot/lollipop chart

1.1.1 One group

# 2007 life expectancy for countries in the Americas. Countries are ordered by
# life expectancy, and `flag` marks those above the mean of the selected rows.
gapminder_dot <- gapminder %>%
  filter(continent == "Americas", year == 2007) %>%
  dplyr::select(country, lifeExp) %>%
  mutate(
    country = fct_reorder(country, lifeExp),
    mean_life = mean(lifeExp),
    flag = lifeExp > mean_life
  )

# Lollipop chart: a faint stem from x = 60 to each country's life expectancy,
# capped with a solid point.
ggplot(gapminder_dot, aes(x = lifeExp, y = country)) +
  geom_segment(
    aes(x = 60, xend = lifeExp, y = country, yend = country),
    color = "#0072B2", size = 0.5, alpha = 0.3
  ) +
  geom_point(color = "#0072B2", size = 3) +
  scale_x_continuous(
    name = "life expectancy (years)",
    limits = c(59.7, 81.5),
    expand = c(0, 0)
  ) +
  scale_y_discrete(name = NULL, expand = c(0, 0.5)) +
  labs(
    title = "Life expectancy of American countries in 2007",
    caption = "Source: gapminder"
  ) +
  theme(
    # Panel: no background, thin grey major grid
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.1),
    # Axes
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title.x = element_text(
      family = font, size = 11, color = text_color, margin = margin(t = 10)
    ),
    # Title, caption and outer margin
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    ),
    plot.margin = margin(18, 6, 3, 1.5)
  )

1.1.2 Change baseline

# Lollipop chart measured from the mean (`mean_life`) instead of a fixed
# baseline. Stems and points are coloured by `flag` (FALSE = below the mean,
# TRUE = above the mean).
ggplot(gapminder_dot, aes(x = lifeExp, y = country, color = flag)) +
  geom_segment(
    aes(x = mean_life, xend = lifeExp, y = country, yend = country),
    size = 0.5,
    alpha = 0.3
  ) +
  geom_point(size = 3) +
  # Text labels for the two groups (numeric y positions on the discrete axis)
  ggplot2::annotate(
    "text", x = 81, y = 14.5,
    label = "Above average",
    family = font, color = text_color, size = 3
  ) +
  ggplot2::annotate(
    "text", x = 69, y = 10,
    label = "Below average",
    family = font, color = text_color, size = 3
  ) +
  # Arrows from the labels towards the points. annotate() draws each arrow
  # exactly once; geom_curve() with constant aes() values would draw one copy
  # per row of `gapminder_dot`, stacked on top of each other.
  ggplot2::annotate(
    "curve", x = 69, xend = 71, y = 9, yend = 7,
    color = text_color, size = 0.1,
    arrow = arrow(length = unit(0.01, "npc"))
  ) +
  ggplot2::annotate(
    "curve", x = 81, xend = 79, y = 15, yend = 17,
    color = text_color, size = 0.1,
    arrow = arrow(length = unit(0.01, "npc"))
  ) +
  scale_x_continuous(
    name = "life expectancy (years)",
    expand = c(0, 0),
    limits = c(59.7, 85)
  ) +
  scale_y_discrete(name = NULL, expand = c(0, 0.5)) +
  # Values are assigned in the order FALSE, TRUE
  scale_color_manual(values = c("#D55E00", "#0072B2")) +
  labs(
    caption = "Source: gapminder",
    title = "Life expectancy of American countries in 2007"
  ) +
  theme(
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.1),
    plot.margin = margin(18, 6, 3, 1.5),
    axis.title.x = element_text(
      family = font, size = 11, color = text_color, margin = margin(t = 10)
    ),
    # The two text annotations replace the legend
    legend.position = "none",
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    )
  )

1.2 Dumbbell chart

# Life expectancy of countries in the Americas for 1987 and 2007, reshaped to
# one row per country with a column per year (`1987`, `2007`).
gapminder_dot_1 <- gapminder %>%
  filter(continent == "Americas", year == 1987 | year == 2007) %>%
  dplyr::select(year, country, lifeExp) %>%
  mutate(country = fct_reorder(country, lifeExp)) %>%
  spread(year, lifeExp)

# Dumbbell chart assembled from primitives: a semi-transparent connector per
# country, an orange point for 1987 and a blue point for 2007.
ggplot(gapminder_dot_1, aes(y = country)) +
  geom_segment(
    aes(x = `1987`, xend = `2007`, yend = country),
    color = "black", alpha = 0.3
  ) +
  geom_point(aes(x = `1987`), color = "#D55E00", size = 2) +
  geom_point(aes(x = `2007`), color = "#0072B2", size = 2) +
  scale_x_continuous(
    name = "life expectancy (years)",
    limits = c(52, 81.5),
    expand = c(0, 0)
  ) +
  scale_y_discrete(name = NULL, expand = c(0, 0.5)) +
  labs(
    title = "Life expectancy of American countries 1987 - 2007",
    caption = "Source: gapminder"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.1),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title.x = element_text(
      family = font, size = 11, color = text_color, margin = margin(t = 10)
    ),
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    ),
    plot.margin = margin(18, 6, 3, 1.5)
  )

# Alternative approach: use geom_dumbbell() from the ggalt package.
# Data: the ten countries with the largest gain in life expectancy between
# 1967 and 2007, one row per country with `1967`, `2007` and `gap` columns.
dumbbell_df <- gapminder %>%
  filter(year %in% c(1967, 2007)) %>%
  dplyr::select(country, year, lifeExp) %>%
  spread(year, lifeExp) %>%
  mutate(gap = `2007` - `1967`) %>%
  arrange(desc(gap)) %>%
  head(10)

# Build the dumbbell plot; countries are ordered by the size of the gain.
# The plot object is stored in `method2` and is not printed here.
method2 <- ggplot(
  dumbbell_df,
  aes(x = `1967`, xend = `2007`, y = reorder(country, gap), group = country)
) +
  geom_dumbbell(
    colour = "#dddddd",
    size = 3,
    colour_x = "#FAAB18",
    colour_xend = "#1380A1"
  ) +
  labs(
    title = "Life expectancy rise in 10 countries, 1967-2007 ",
    subtitle = "We are living longer",
    caption = "Source: gapminder"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#d9dbda", size = 0.4),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold"
    ),
    plot.subtitle = element_text(family = font, size = 11, color = text_color),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0,
      margin = margin(t = 10)
    )
  )

1.3 Radar chart

1.3.1 One observation

# Scores of three students in nine subjects (one row per student).
exam_scores <- data.frame(
  row.names = c("Student.1", "Student.2", "Student.3"),
  Biology = c(7.9, 3.9, 9.4),
  Physics = c(10, 20, 0),
  Maths = c(3.7, 11.5, 2.5),
  Sport = c(8.7, 20, 4),
  English = c(7.9, 7.2, 12.4),
  Geography = c(6.4, 10.5, 6.5),
  Art = c(2.4, 0.2, 9.8),
  Programming = c(0, 0, 20),
  Music = c(20, 20, 20)
)

# Data layout expected by fmsb::radarchart():
# - row 1 holds the maximum of each variable
# - row 2 holds the minimum of each variable
# - the individuals to plot start at row 3
# - there must be more than 2 columns

# Every subject shares the same axis range: maximum 20, minimum 0.
max_min <- as.data.frame(lapply(exam_scores, function(scores) c(20, 0)))
rownames(max_min) <- c("Max", "Min")

# Put the range rows on top of the student rows
df <- rbind(max_min, exam_scores)

# Radar chart for student 1: keep the two range rows plus that student's row
student1_data <- df[c("Max", "Min", "Student.1"), ]

radarchart(
  student1_data,
  title = "Student A score report",
  axistype = 1,
  caxislabels = c(0, 5, 10, 15, 20),
  axislabcol = "#cbcbcb",
  # Polygon: outline colour, fill colour, outline width
  pcol = "#1b9e77",
  pfcol = scales::alpha("#1b9e77", 0.5),
  plwd = 1.5,
  # Grid: colour, line width, line type
  cglcol = "#cbcbcb",
  cglwd = 0.9,
  cglty = 3,
  # Variable labels
  vlabels = names(student1_data),
  vlcex = 0.7
)

1.3.2 Multiple observations 1

# Radar chart overlaying all three students (rows 3-5 of `df`; rows 1-2 are
# the Max/Min axis ranges).
radarchart(df,
           axistype = 1,
           # Customize the polygons (one entry per student)
           pcol = c("#e41a1c", "#377eb8", "#4daf4a"), # line colors
           pfcol = scales::alpha(c("#e41a1c", "#377eb8", "#4daf4a"), 0.6), # fill colors
           plwd = c(1.5, 1.5, 1.5), # line widths
           plty = c(1, 1, 1),       # line types
           # Customize the grid
           cglcol = "#cbcbcb", # color
           cglwd = 0.9,        # line width
           cglty = 3,          # line type
           axislabcol = "#cbcbcb",
           title = "Score report of three students ",
           # Label the axes from the data actually plotted rather than from the
           # unrelated single-student subset.
           vlabels = colnames(df),
           vlcex = 0.7,
           caxislabels = c(0, 5, 10, 15, 20)
           )

1.3.3 Multiple observations 2

# Define colors and titles, one entry per student
colors <- c("#00AFBB", "#E7B800", "#FC4E07")
titles <- c("Student A", "Student B", "Student C")

# Split the device into 1 row x 3 columns. par() returns the previous values
# of the settings it changes, so the layout can be restored after the loop.
old_par <- par(mfrow = c(1, 3))

# Create one radar chart per student
for (i in seq_along(titles)) {
  # Rows 1-2 hold the Max/Min axis ranges; row i + 2 is the i-th student
  student_rows <- df[c(1, 2, i + 2), ]
  radarchart(student_rows,
             axistype = 1,
             # Customize the polygon
             pcol = colors[i], # color of the line
             pfcol = scales::alpha(colors[i], 0.5), # filling color
             plwd = 1.5, # line width
             # Customize the grid
             cglcol = "#cbcbcb", # color
             cglwd = 0.9, # line width
             cglty = 3,
             axislabcol = "#cbcbcb",
             title = titles[i],
             vlabels = colnames(student_rows),
             vlcex = 0.7,
             caxislabels = c(0, 5, 10, 15, 20)
             )
}

# Restore the single-panel layout so later base plots are not squeezed into
# a third of the device.
par(old_par)

### Compared with the average

# Simulated scores: 10 students x 9 subjects, integers drawn from 2..20.
# NOTE: this overwrites the `df` built for the earlier radar charts.
set.seed(123)
df <- as.data.frame(
  matrix(sample(2:20, 90, replace = TRUE), ncol = 9, byrow = TRUE)
)
colnames(df) <- c(
  "Biology", "Physics", "Maths", "Sport", "English",
  "Geography", "Art", "Programming", "Music"
)
rownames(df) <- paste0("Student.", seq_len(nrow(df)))

# Rescale every subject with scales::rescale() and round to two decimals
df_scaled <- as.data.frame(round(apply(df, 2, scales::rescale), 2))

# Per-subject summaries of the rescaled scores
col_max <- vapply(df_scaled, max, numeric(1))
col_min <- vapply(df_scaled, min, numeric(1))
col_mean <- vapply(df_scaled, mean, numeric(1))  # the "average student" profile

# One summary per row: Max, Min, Average
col_summary <- rbind(Max = col_max, Min = col_min, Average = col_mean)

# Summary rows first (as radarchart() expects), then one row per student
df_scaled2 <- as.data.frame(rbind(col_summary, df_scaled))

# Save the current graphical parameters. no.readonly = TRUE keeps only the
# settable ones, so restoring them later does not warn about read-only
# parameters.
opar <- par(no.readonly = TRUE)

# Plot in a 3 x 4 grid with small margins
par(mar = rep(0.8, 4), mfrow = c(3, 4))

# One radar chart per student: rows 1-3 of df_scaled2 are Max, Min and
# Average, so every chart shows the average profile (grey fill) with the
# student's profile drawn on top (colour 2, no fill).
for (i in setdiff(seq_len(nrow(df_scaled2)), 1:3)) {
  radarchart(
    df_scaled2[c(1:3, i), ],
    pfcol = c("#99999980", NA),
    pcol = c(NA, 2), plty = 1, plwd = 2,
    title = row.names(df_scaled2)[i]
  )
}

# Restore the saved par() settings. The result is deliberately not assigned:
# storing it in a variable named `par` would shadow the function's name.
par(opar)

1.4 Bar plot

1.4.1 Vertical bars

# Weekend box-office figures for five movies, in rank order
# (full title, short title for axis labels, gross as number and as text).
boxoffice <- data.frame(
  rank = seq_len(5),
  title = c(
    "Star Wars: The Last Jedi",
    "Jumanji: Welcome to the Jungle",
    "Pitch Perfect 3",
    "The Greatest Showman",
    "Ferdinand"
  ),
  title_short = c(
    "Star Wars",
    "Jumanji",
    "Pitch Perfect 3",
    "Greatest Showman",
    "Ferdinand"
  ),
  amount = c(71565498, 36169328, 19928525, 8805843, 7316746),
  amount_text = c(
    "$71,565,498",
    "$36,169,328",
    "$19,928,525",
    "$8,805,843",
    "$7,316,746"
  )
)

# Vertical bar chart of weekend gross; bars are ordered by rank.
ggplot(boxoffice, aes(x = fct_reorder(title_short, rank), y = amount)) +
  geom_col(fill = "#56B4E9", width = 0.6, alpha = 0.9) +
  scale_x_discrete(name = NULL, expand = c(0, 0.4)) +
  # The axis is labelled in millions of USD
  scale_y_continuous(
    name = "Weekend gross (million USD)",
    breaks = c(0, 2e7, 4e7, 6e7),
    labels = c("0", "20", "40", "60"),
    expand = c(0, 0)
  ) +
  coord_cartesian(clip = "off") +
  labs(
    title = "Highest grossing movies for 22-24.12.2017",
    caption = "Source: Box Office Mojo"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title.y = element_text(
      family = font, color = text_color, size = 11, margin = margin(r = 20)
    ),
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold",
      margin = margin(b = 10), hjust = 0
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color,
      margin = margin(t = 15), hjust = 0
    )
  )

1.4.2 Horizontal bars

# Horizontal bar chart of weekend gross. The factor is ordered by descending
# rank so that rank 1 ends up at the top after the axes are flipped.
ggplot(boxoffice, aes(x = fct_reorder(title_short, desc(rank)), y = amount)) +
    geom_col(
      fill = "#56B4E9",
      alpha = 0.9) +
    scale_y_continuous(
      expand = c(0,0),
      breaks = c(0, 2e7, 4e7, 6e7),
      labels = c("0", "20", "40", "60"),
      name = "Weekend gross (million USD)") +
    scale_x_discrete(name = NULL) +
    labs(
      title = "Highest grossing movies for 22-24.12.2017",
      caption = "Source: Box Office Mojo")  +
    # A plot has a single coordinate system: adding coord_flip() after
    # coord_cartesian(clip = "off") replaced it and dropped the clip setting.
    # Passing clip to coord_flip() keeps both the flip and the unclipped panel.
    coord_flip(clip = "off") +
    theme(
      axis.ticks = element_blank(),
      axis.text = element_text(family = font, color = text_color, size = 11),
      axis.text.x = element_text(vjust = 1, hjust = 1),
      axis.title = element_text(family = font, color = text_color, size = 11),
      panel.background = element_blank(),
      panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
      plot.title = element_text(family = font, size = 15, color = text_color, face = "bold", margin = margin(b = 10), vjust = -2),
      plot.caption = element_text(family = font, size = 10, color = caption_color, hjust = 0)
      )

1.4.3 Grouped bars

# Mean price for every color x cut combination of the diamonds data.
# ungroup() drops the grouping that summarize() leaves behind, so later
# operations on this table are not silently performed per group.
diamonds_by_color_cut <-
  diamonds %>%
  group_by(color, cut) %>%
  summarize(average_price = mean(price)) %>%
  ungroup()

# Five shades (positions 4 to 8) of the 9-class "BuGn" palette, one per cut
color_five <- RColorBrewer::brewer.pal(n = 9, "BuGn")[4:8]

# Grouped bars: color on the x axis, one dodged bar per cut, which puts the
# emphasis on price differences between color groups.
ggplot(diamonds_by_color_cut, aes(x = color, y = average_price, fill = cut)) +
  geom_col(position = "dodge", alpha = 0.9) +
  scale_y_continuous(
    name = "Average price (USD)",
    breaks = c(1000, 2000, 3000, 4000, 5000, 6000),
    labels = c("1,000", "2,000", "3,000", "4,000", "5,000", "6,000"),
    expand = c(0, 0)
  ) +
  scale_fill_manual(values = color_five, name = NULL) +
  coord_cartesian(clip = "off") +
  labs(
    title = "Prices of 50,000 diamonds versus cut quality and color",
    caption = "Source: diamonds - ggplot2",
    x = "Color"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title = element_text(family = font, color = text_color, size = 11),
    axis.title.x = element_text(margin = margin(t = 8)),
    axis.title.y = element_text(margin = margin(r = 8)),
    legend.text = element_text(family = font, color = text_color, size = 8),
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold"
    ),
    plot.caption = element_text(
      family = font, size = 10, color = text_color, hjust = 0
    )
  )

1.4.4 Stacked Bars

# NOTE(review): absolute, machine-specific path; adjust for your environment.
titanic <- read.csv("/Users/huvi/Desktop/datasets/titanic.csv")

# Passenger counts per class and sex, plus the y position (`nlabel`) of the
# centre of each stacked segment. Within a class, rows are ordered male then
# female so that the running total follows the stacking order of the bars
# (first factor level, "female", on top).
titanic_stacked <- titanic %>%
  count(Pclass, Sex) %>%
  arrange(Pclass, desc(Sex)) %>%
  mutate(Sex = factor(Sex, levels = c("female", "male"))) %>%
  group_by(Pclass) %>%
  mutate(nlabel = cumsum(n) - n / 2) %>%
  # Relabel the class only after ungrouping: rewriting a grouping column
  # inside a grouped mutate() is rejected by some dplyr versions, and the
  # result no longer needs to stay grouped.
  ungroup() %>%
  mutate(Pclass = paste(Pclass, "class"))

# Stacked bars: one bar per class, split by sex, with the count printed in the
# centre of every segment (positions precomputed in `nlabel`).
ggplot(titanic_stacked, aes(x = Pclass, y = n, fill = Sex)) +
  geom_col(position = "stack", color = "white", size = 1, width = 1) +
  geom_text(aes(y = nlabel, label = n), color = "white", size = 6, family = font) +
  labs( title = "Numbers of passengers on the Titanic based on gender and class",
        caption = "Source: Encyclopedia Titanica") +
  scale_x_discrete(expand = c(0,0), name = NULL) +
  scale_y_continuous(expand = c(0,0), name = NULL, breaks = NULL) +
  coord_cartesian(clip = "off") +
  # Labels are listed in the same order as the breaks. They were previously
  # swapped, so the legend described female passengers as male and vice versa.
  scale_fill_manual(values = c("#D55E00", "#0072B2"),
                    breaks = c("female", "male"),
                    labels = c("female passengers", "male passengers"),
                    name = NULL) +
  theme(panel.background = element_blank(),
        axis.text = element_text(family = font, color = text_color, size = 11),
        axis.text.x = element_text(margin = margin(t = 15)),
        legend.text = element_text(family = font, color = text_color, size = 11),
        legend.position = "bottom",
        legend.justification = "center",
        legend.spacing.x = grid::unit(7, "pt"),
        legend.spacing.y = grid::unit(0, "cm"),
        plot.title = element_text(family = font, size = 15, color = text_color, face = "bold", margin = margin(b = 30)),
        plot.caption = element_text(family = font, size = 10, color = text_color, hjust = 0))

1.5 Heatmap

# The 20 countries shown in the heatmap
country_list <- c(
  "United States", "China", "India", "Japan", "Vietnam",
  "Brazil", "Germany", "France", "United Kingdom", "Italy",
  "New Zealand", "Canada", "Mexico", "Chile", "Argentina",
  "Norway", "South Africa", "Myanmar", "Israel", "Iceland"
)

# NOTE(review): absolute, machine-specific path
internet <- read.csv("/Users/huvi/Desktop/datasets/Individuals.using.the.Internet.%.population.World.Bank.csv")

# Keep country, year and the share of internet users for the selected
# countries; missing shares are replaced by 0.
internet_short <- internet %>%
  dplyr::select(
    Country = Country.Name,
    Time,
    Users = Individuals.using.the.Internet....of.population...IT.NET.USER.ZS.
  ) %>%
  filter(Country %in% country_list) %>%
  mutate(Users = ifelse(is.na(Users), 0, Users))

# Per country: first year with a non-zero share and the share in the last row.
# NOTE(review): `Users[n()]` assumes rows are in chronological order within
# each country; confirm against the CSV.
internet_summary <- internet_short %>%
  group_by(Country) %>%
  summarize(
    Time1 = min(Time[Users > 0]),
    Last = Users[n()]
  ) %>%
  arrange(Last, desc(Time1))

# Order the countries on the y axis by that summary and force a numeric fill
internet_short <- internet_short %>%
  mutate(
    Country = factor(Country, levels = internet_summary$Country),
    Users = as.numeric(Users)
  )

# Heatmap: one tile per country and year, filled by the share of internet users.
ggplot(internet_short, aes(x = Time, y = Country, fill = Users)) +
  # White borders separate neighbouring tiles
  geom_tile(color = "white", size = 0.3) +
  scale_x_continuous(name = NULL, expand = c(0, 0)) +
  scale_y_discrete(name = NULL, position = "right") +
  scale_fill_viridis_c(
    name = "internet users / 100 people",
    option = "A",
    begin = 0.05,
    end = 0.98,
    limits = c(0, 100), # fixed range for the legend
    guide = guide_colorbar(
      direction = "horizontal",
      title.position = "top",
      label.position = "bottom",
      barwidth = grid::unit(2.5, "in"),
      barheight = grid::unit(0.2, "in"),
      ticks = FALSE
    )
  ) +
  labs(
    title = "Percentage of internet users in 20 countries between 1999 and 2019",
    caption = "Source: World Bank"
  ) +
  theme(
    panel.background = element_blank(),
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.ticks.length = grid::unit(1, "pt"),
    axis.text = element_text(family = font, color = text_color, size = 11),
    legend.position = "bottom",
    legend.justification = "right",
    legend.box.spacing = unit(0, "pt"),
    legend.title = element_text(family = font, color = text_color, size = 11),
    legend.title.align = 0.5,
    legend.text = element_text(family = font, color = text_color, size = 9),
    plot.title = element_text(
      family = font, color = text_color, size = 13, face = "bold",
      hjust = 0, margin = margin(b = 10)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0
    )
  )

1.6 Word cloud

# Read the text, one element per line
# NOTE(review): absolute, machine-specific path
HP <- readLines("/Users/huvi/Desktop/datasets/HP_Part1_Text.txt")
# Wrap the lines in a tm corpus
docs <- Corpus(VectorSource(HP))

# Transformer that replaces a pattern with a space. It is only needed for the
# optional clean-up steps below, which are currently disabled.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
#docs <- tm_map(docs, toSpace, "/")
#docs <- tm_map(docs, toSpace, "@")
#docs <- tm_map(docs, toSpace, "—")

# Clean-up pipeline; the steps run in this order:
docs <- docs %>%
  tm_map(content_transformer(tolower)) %>%          # lower case
  tm_map(removeNumbers) %>%                         # drop numbers
  tm_map(removeWords, stopwords("english")) %>%     # common English stopwords
  tm_map(removeWords, c("blabla1", "blabla2")) %>%  # custom stopwords (placeholders)
  tm_map(removePunctuation) %>%                     # drop punctuation
  tm_map(stripWhitespace)                           # collapse extra white space
# Optional text stemming:
# docs <- tm_map(docs, stemDocument)

# Term frequencies, most frequent first
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Drop hand-picked rows by their rank in the frequency table.
# NOTE(review): these positions depend on the exact input text; revisit them
# if the text or the clean-up steps change.
d <- d[-c(2, 4, 11, 15, 28, 34, 42, 43), ]

wordcloud2(
  data = d,
  size = 1.5,
  color = "random-dark",
  fontFamily = font
)

2 Visualize distribution

2.1 Histogram

2.1.1 Single histogram

# NOTE(review): absolute, machine-specific path
titanic <- read.csv("/Users/huvi/Desktop/datasets/titanic.csv")

# Passenger counts in sixteen 5-year age bins. The breaks are shifted by 0.01
# (0.01, 5.01, ..., 80.01).
age_counts <- hist(titanic$Age, breaks = (0:16) * 5 + .01, plot = FALSE)$counts

# One row per bin: text label, count and bin midpoint (2.5, 7.5, ..., 77.5)
age_hist <- data.frame(
  "Age_range" = c(
    "0--5",
    paste0(seq(6, 76, by = 5), "--", seq(10, 80, by = 5))
  ),
  Count = age_counts,
  age = (1:16) * 5 - 2.5,
  check.names = FALSE
)

# Histogram drawn from the precomputed bins: one column per 5-year bin,
# centred on the bin midpoint and slightly narrower than the bin.
ggplot(age_hist, aes(x = age, y = Count)) +
  geom_col(fill = "#56B4E9", width = 4.7) +
  scale_x_continuous(
    name = "age(years)",
    limits = c(0, 80),
    breaks = seq(0, 80, 10),
    expand = c(0, 0)
  ) +
  scale_y_continuous(breaks = 25 * (0:6), expand = c(0, 0)) +
  coord_cartesian(clip = "off") +
  labs(
    title = "Number of passengers with known age on the Titanic",
    caption = "Source: Encyclopedia Titanica"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title = element_text(family = font, color = text_color, size = 11),
    plot.title = element_text(
      family = font, size = 13, color = text_color, face = "bold",
      margin = margin(b = 10)
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0
    )
  )

2.1.2 Two histograms

# Per-sex passenger counts in 5-year age bins, in long format
# (age = bin midpoint, gender, count).
# NOTE(review): `gender_counts` was used by this plot without being defined
# anywhere in the script. The definition below reuses the binning of the
# single histogram; confirm it matches the data originally intended.
age_breaks <- (0:16) * 5 + .01
count_ages <- function(sex) {
  hist(titanic$Age[titanic$Sex == sex], breaks = age_breaks, plot = FALSE)$counts
}
gender_counts <- data.frame(
  age = rep((1:16) * 5 - 2.5, times = 2),
  gender = rep(c("male", "female"), each = 16),
  count = c(count_ages("male"), count_ages("female"))
)

# Back-to-back histogram: male counts are plotted as negative values so the
# two distributions grow in opposite directions from zero; the axis labels
# show absolute counts.
ggplot(gender_counts, aes(x = age, y = ifelse(gender == "male", -1, 1)*count, fill = gender)) +
  geom_col() +
  scale_y_continuous(
    expand = c(0,0),
    breaks = 20 * (-5:3),
    name = "count",
    labels = c("100","80","60","40","20","0","20","40","60")) +
  scale_x_continuous(expand = c(0,0), name = "age(years)", limits = c(0,83), breaks = seq(0, 80, 20))+
  coord_flip() +
  scale_fill_manual(
    values = c("#D55E00", "#0072B2"),
    guide = "none") +
  labs(
    caption = "Source: Encyclopedia Titanica",
    title = "Numbers of passenger with known age on the Titanic") +
  # Direct labels replace the legend
  draw_text(x = 70, y = -39, "male", hjust = 0, size = 11, family = font, color = text_color) +
  draw_text(x = 70, y = 21, "female", hjust = 0, size = 11, family = font, color = text_color) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.2),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title = element_text(family = font, color = text_color, size = 11),
    plot.title = element_text(family = font, size = 13, color = text_color, face = "bold", margin = margin(b = 10)),
    plot.caption = element_text(family = font, size = 10, color = caption_color, hjust = 0))

2.2 Density plot

2.2.1 Single density

# Kernel density of passenger age (Gaussian kernel, bandwidth 2), drawn as a
# filled area with a dark outline.
ggplot(titanic, aes(x = Age)) +
  geom_density_line(
    fill = "#56B4E9",
    color = "darkblue",
    alpha = 0.9,
    bw = 2,
    kernel = "gaussian"
  ) +
  scale_x_continuous(
    name = "age (years)",
    limits = c(0, 80),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    name = "density",
    limits = c(0, 0.045),
    expand = c(0, 0)
  ) +
  coord_cartesian(clip = "off") +
  labs(
    title = "Age distribution of passengers on the Titanic",
    caption = "Source: Encyclopedia Titanica"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 10),
    axis.title = element_text(family = font, color = text_color, size = 11),
    plot.title = element_text(
      family = font, size = 13, color = text_color, face = "bold",
      margin = margin(t = 20)
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0
    )
  )

2.2.2 Two densities

# Copy of the data with Sex as a factor (levels: female, male)
titanic2 <- titanic
titanic2$Sex <- factor(titanic2$Sex, levels = c("female", "male"))

# Overlapping age densities per sex, scaled by group size (..count..).
ggplot(titanic2, aes(x = Age, y = ..count.., fill = Sex, color = Sex)) +
  geom_density_line(alpha = 0.6, bw = 2) +
  scale_y_continuous(
    expand = c(0,0),
    limits = c(0, 22),
    name = "scaled density") +
  scale_x_continuous(
    expand = c(0,0),
    limits = c(0,80),
    name = "age (years)") +
  # Colours are named so they no longer depend on the factor level order:
  # male = blue, female = orange, matching the other Titanic plots in this
  # script. Unnamed values gave female blue and male orange here.
  scale_fill_manual(
    values = c(male = "#0072B2", female = "#D55E00"),
    name = "gender") +
  scale_color_manual(
    values = c(male = "darkblue", female = "darkorange"),
    name = "gender") +
  # Hide the outline in the legend keys
  guides(fill = guide_legend(override.aes = list(linetype = 0))) +
  coord_cartesian(clip = "off") +
  labs(
    caption = "Source: Encyclopedia Titanica",
    title = "Age distribution of passengers on the Titanic") +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title = element_text(family = font, color = text_color, size = 11),
    plot.title = element_text(family = font, size = 13, color = text_color, face = "bold", margin = margin(b = 20)),
    plot.caption = element_text(family = font, size = 10, color = text_color, hjust = 0),
    legend.position = c(0.9, 0.8),
    legend.justification = c("right", "top"),
    legend.text = element_text(family = font, color = text_color, size = 11),
    legend.title = element_text(family = font, color = text_color, size = 11)
  )

# One panel per sex. Each panel shows the density of all passengers in grey
# with the density of that sex drawn on top.
ggplot(titanic2, aes(x = Age, y = ..count..)) +
  # The Sex column is dropped from this layer's data so that it is not split
  # by the facet and appears in every panel.
  geom_density_line(
    data = dplyr::select(titanic, -Sex),
    aes(fill = "all passengers"),
    color = "transparent") +
  geom_density_line(
    aes(fill = Sex),
    bw = 2,
    color = "transparent") +
  scale_y_continuous(
    expand = c(0,0),
    limits = c(0, 30),
    name = "scaled density") +
  scale_x_continuous(
    expand = c(0,0),
    limits = c(0,80),
    name = "age (years)") +
  scale_fill_manual(
    values = c("#b3b3b3a0", "#0072B2", "#D55E00"),
    breaks = c("all passengers", "male", "female"),
    # Fixed the legend typo "all passeengers"
    labels = c("all passengers", "males", "females"),
    name = NULL,
    guide = guide_legend(direction = "horizontal")
    ) +
  coord_cartesian(clip = "off") +
  # Strip labels read "female passengers" / "male passengers"
  facet_wrap(~Sex, labeller = labeller(Sex = function(Sex) paste(Sex, "passengers"))) +
  labs(
    caption = "Source: Encyclopedia Titanica",
    title = "Age distribution of passengers on the Titanic") +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 10),
    axis.title = element_text(family = font, color = text_color, size = 11),
    strip.text = element_text(size = 11),
    strip.background = element_blank(),
    plot.title = element_text(family = font, size = 13, color = text_color, face = "bold", margin = margin(b = 10)),
    plot.caption = element_text(family = font, size = 11, color = caption_color, hjust = 0),
    legend.position = "bottom",
    legend.justification = "right",
    legend.text = element_text(family = font, color = text_color, size = 10),
    legend.spacing.x = grid::unit(4.5, "pt"),
    panel.spacing = grid::unit(1, "lines")
  )

2.2.3 Multiple densities

# Drop the "Canadian" breed; breed is converted to character first
cows_filtered <- Cows %>%
  mutate(breed = as.character(breed)) %>%
  filter(breed != "Canadian")

# Compute a butterfat density curve for every breed.
# NOTE(review): this relies on ggplot2:::compute_density(), an unexported
# helper whose signature may change between ggplot2 releases.
cows_dens <- group_by(cows_filtered, breed) %>%
  do(ggplot2:::compute_density(.$butterfat, NULL)) %>%
  rename(butterfat = x)

# Peak of each breed's curve (the data is still grouped by breed here), plus
# hand-tuned offsets used to place the breed labels.
# NOTE(review): the length-4 vectors assume exactly four rows, i.e. one peak
# for each of four remaining breeds.
cows_max <- filter(cows_dens, density == max(density)) %>%
  ungroup() %>%
  mutate(
    hjust = c(0, 0, 0, 0),
    vjust = c(0, 0, 0, 0),
    nudge_x = c(-0.2, -0.2, 0.1, 0.23),
    nudge_y = c(0.03, 0.03, -0.2, -0.06))

# Overlapping butterfat densities, one curve per breed, labelled directly at
# the peak of each curve instead of using a legend.
ggplot(cows_dens, aes(x = butterfat, y = density, color = breed, fill = breed)) +
  # The densities are precomputed, so they are drawn as-is
  geom_density_line(stat = "identity") +
  geom_text(
    data = cows_max,
    aes(
      label = breed, hjust = hjust, vjust = vjust,
      color = breed,
      x = butterfat + nudge_x,
      y = density + nudge_y
    ),
    inherit.aes = FALSE,
    # Constants are set as layer parameters, not inside aes(). A constant
    # `size` mapped in aes() was overridden by the parameter below anyway,
    # but mapping it can add a spurious size legend.
    family = font,
    size = 12/.pt
  ) +
  scale_color_manual(
    values = darken(c("#56B4E9", "#E69F00", "#D55E00", "#009E73"), 0.3),
    breaks = c("Ayrshire", "Guernsey", "Holstein-Friesian", "Jersey"),
    guide = "none"
  ) +
  scale_fill_manual(
    values = c("#56B4E950", "#E69F0050", "#D55E0050", "#009E7350"),
    breaks = c("Ayrshire", "Guernsey", "Holstein-Friesian", "Jersey"),
    guide = "none"
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    # scale = 1 appends "%" without multiplying the values by 100
    labels = scales::percent_format(accuracy = 1, scale = 1),
    name = "butterfat contents"
  ) +
  scale_y_continuous(limits = c(0, 1.99), expand = c(0, 0)) +
  coord_cartesian(clip = "off") +
  labs(
    caption = "Source: Canadian Record of Performance for Purebred Dairy Cattle",
    title = "Butterfat percentage in the milk of four cattle breeds") +
  theme(
    panel.background = element_blank(),
    panel.grid.major.y = element_line(color = "#cbcbcb", size = 0.5),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, color = text_color, size = 11),
    axis.title = element_text(family = font, color = text_color, size = 11),
    plot.title = element_text(family = font, size = 13, color = text_color, face = "bold", margin = margin(b = 10)),
    plot.caption = element_text(family = font, size = 10, color = caption_color, hjust = 0))

3 Visualize correlation

3.1 Scatterplot

3.1.1 Basic scatterplot

# Load the BlueJays data set
data(BlueJays)

# Scatterplot of head length against body mass, filled by sex; the points are
# filled circles (pch 21) with a white outline.
ggplot(BlueJays, aes(x = Mass, y = Head, fill = KnownSex)) +
  geom_point(pch = 21, color = "white", size = 2.5) +
  scale_x_continuous(name = "body mass (g)") +
  scale_y_continuous(name = "head length (mm)") +
  scale_fill_manual(
    name = NULL,
    values = c(F = "#D55E00", M = "#0072B2"),
    breaks = c("F", "M"),
    labels = c("female birds", "male birds"),
    guide = guide_legend(
      direction = "horizontal",
      override.aes = list(size = 3)
    )
  ) +
  labs(
    title = "Head length versus body mass for 123 blue jays",
    caption = "Source: Keith Tarvin, Oberlin College"
  ) +
  theme(
    # Panel
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.3),
    # Axes
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title = element_text(family = font, size = 11, color = text_color),
    axis.title.x = element_text(margin = margin(t = 10)),
    # Legend: horizontal, top right, close to the panel
    legend.position = "top",
    legend.justification = "right",
    legend.box.spacing = unit(3.5, "pt"), # distance between legend and plot
    legend.spacing.x = unit(2, "pt"),
    legend.key = element_blank(),
    legend.key.width = unit(10, "pt"),
    legend.background = element_rect(fill = "white", color = "white"),
    legend.text = element_text(family = font, color = text_color, vjust = 0.6),
    # Title and caption
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    )
  )

3.1.2 All-against-all scatterplot

# Long table with every pairing of the three measurements per bird:
# (var_x, val_x) is the measurement on the x axis and (var_y, val_y) the one
# on the y axis, giving 3 x 3 rows per bird.
bj_matrix <- BlueJays %>%
  dplyr::select(BirdID, KnownSex, Head, Mass, Skull) %>%
  gather(var_x, val_x, Head:Skull) %>%
  # Join the wide measurements back on. BirdID is the only shared column, so
  # naming it keeps the result unchanged while making the key explicit and
  # silencing the "Joining, by = ..." message.
  left_join(
    dplyr::select(BlueJays, BirdID, Head, Mass, Skull),
    by = "BirdID"
  ) %>%
  gather(var_y, val_y, Head:Skull)

labels <- c(
  Head = "head length (mm)",
  Mass = "body mass (g)", 
  Skull = "skull size (mm)"
)

ggplot (bj_matrix, aes(x = val_x, y = val_y, fill = KnownSex)) + 
  geom_point(pch = 21, color = "white", size = 2, stroke = 0.2) +
  scale_x_continuous(
    expand_scale(mult = 0.1),
    breaks = scales::pretty_breaks(4, min.n = 3)) +
  scale_y_continuous(
    expand_scale(mult = 0.1),
    breaks = scales::pretty_breaks(4, min.n = 3)) +
  scale_fill_manual(
    values = c(F = "#D55E00", M = "#0072B2"),
    breaks = c("F", "M"),
    labels = c("female birds", "male birds"),
    name = NULL,
    guide = guide_legend(
      direction = "horizontal",
      override.aes = list(size = 2.5))) +
  labs(
    caption = "Source: Keith Tarvin, Oberlin College",
    title = "All-against-all scattereplot for 123 blue jays",
    x = NULL,
    y = NULL) +
  facet_grid(
    var_y ~ var_x,
    scales = "free",
    switch = "both", 
    labeller = labeller(
      var_x = labels, 
      var_y = labels
    )
  ) +
  coord_cartesian(clip = "off") +
  panel_border(colour = "grey85", size = 0.4) +
  theme(
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 10, color = text_color),
    axis.title = element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.5),
    plot.title = element_text(family = font, color = text_color, size = 14, face = "bold", margin = margin(b = 10)),
    plot.caption = element_text(family = font, color = caption_color, size = 9, hjust = 0, vjust = 2, margin = margin(t = 20)),
    legend.position = "top",
    legend.justification = "right",
    legend.box.spacing = grid::unit(1, "pt"), #distance between legend and plot
    legend.text = element_text(family = font, color = text_color, vjust = 0.6),
    legend.spacing.x = unit(2, "pt"),
    legend.background = element_rect(fill = "white", color = "white"),
    legend.key.width = unit(10, "pt"),
    legend.key = element_blank(),
    strip.background = element_blank(),
    strip.placement = "outside",
    strip.text.x = element_text(family = font, color = text_color, vjust = 1, margin = margin(0,0,0,0)),
    strip.text.y = element_text(family = font, color = text_color, vjust = 0, angle = -90, margin = margin(0,3.5,0,0))
    )

3.2 Bubble chart

# Human-readable sex label used for the facet strips, with a fixed level order.
BlueJays$sex <- factor(
  ifelse(BlueJays$KnownSex == "F", "female birds", "male birds"),
  levels = c("female birds", "male birds")
)

# Head length vs body mass, one panel per sex; point radius encodes Skull.
ggplot(BlueJays, aes(x = Mass, y = Head, fill = KnownSex, size = Skull)) +
  geom_point(shape = 21, colour = "white") +
  facet_wrap(~sex, ncol = 2, scales = "fixed") +
  scale_x_continuous(name = "body mass (g)") +
  scale_y_continuous(
    name = "head length (mm)",
    breaks = c(52, 54, 56, 58, 60)
  ) +
  # Fill duplicates the facet split, so its legend is suppressed.
  scale_fill_manual(
    name = NULL,
    values = c(F = "#D55E00", M = "#0072B2"),
    breaks = c("F", "M"),
    labels = c("female birds", "male birds"),
    guide = "none"
  ) +
  scale_radius(
    name = "skull size (mm)",
    limits = c(28, 34),
    range = c(2, 7),
    breaks = c(28, 30, 32, 34),
    # Trailing spaces pad the gap between neighbouring legend entries.
    labels = c("28   ", "30   ", "32   ", "34"),
    guide = guide_legend(
      direction = "horizontal",
      title.position = "top",
      title.hjust = 0.5,
      label.position = "right",
      override.aes = list(fill = "gray40")
    )
  ) +
  labs(
    title = "Bubble chart for 123 blue jays",
    caption = "Source: Keith Tarvin, Oberlin College",
    x = NULL,
    y = NULL
  ) +
  theme(
    # Panel
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(color = "#cbcbcb", size = 0.5),
    # Axes
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 10, color = text_color),
    axis.title = element_text(family = font, size = 10, color = text_color),
    axis.title.x = element_text(margin = margin(t = 10)),
    # Facet strips
    strip.text = element_text(
      family = font, color = text_color, size = 10,
      margin = margin(2, 0, 2, 0)
    ),
    strip.background = element_rect(
      fill = "grey85", colour = "grey85",
      linetype = 1, size = 0.25
    ),
    # Legend anchored to the bottom-right corner
    legend.position = c(1, 0),
    legend.justification = c(1, 0),
    legend.spacing.x = unit(2, "pt"),
    legend.spacing.y = unit(2, "pt"),
    legend.key = element_blank(),
    legend.key.width = unit(5, "pt"),
    legend.background = element_rect(fill = "white", color = "white"),
    legend.title = element_text(family = font, color = text_color, size = 10),
    legend.text = element_text(
      family = font, color = text_color, size = 10, vjust = 0.6
    ),
    # Title and caption
    plot.title = element_text(
      family = font, color = text_color, size = 14, face = "bold",
      margin = margin(b = 10)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2,
      margin = margin(t = 10)
    )
  )

3.3 Correlogram

# Pairwise correlations of the first seven mtcars columns, melted to long
# form (Var1, Var2, value) so that each pair can be drawn as one tile.
data <- cor(mtcars[, 1:7])
corre <- melt(data)

# Keep each unordered pair once (Var1 before Var2 in factor order); this also
# removes the diagonal.
corre %>%
  filter(as.integer(Var1) < as.integer(Var2)) %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(colour = "white", size = 1) +
  scale_y_discrete(name = NULL, expand = c(0, 0)) +
  scale_x_discrete(name = NULL, position = "top", expand = c(0, 0)) +
  scale_fill_continuous_divergingx(
    name = "correlation",
    palette = "Tropic",
    rev = FALSE,
    limits = c(-0.9, 0.91),
    breaks = c(-0.9, 0, 0.9),
    labels = c("–0.9", "0", "0.9"),
    guide = guide_colorbar(
      direction = "horizontal",
      title.position = "top",
      label.position = "bottom",
      barwidth = grid::unit(140, "pt"),
      barheight = grid::unit(17.5, "pt"),
      ticks.linewidth = 1
    )
  ) +
  # Square cells.
  coord_fixed() +
  labs(
    title = "Correlation in automobile aspects",
    caption = "Source: mtcars"
  ) +
  theme(
    panel.background = element_blank(),
    # Axes: labels only
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.ticks.length = grid::unit(3, "pt"),
    axis.text = element_text(family = font, color = text_color, size = 11),
    # Colour bar sits in the empty lower-right triangle
    legend.position = c(.97, .0),
    legend.justification = c(1, 0),
    legend.title.align = 0.5,
    legend.title = element_text(family = font, color = text_color, size = 11),
    legend.text = element_text(family = font, color = text_color, size = 11),
    # Title and caption
    plot.title = element_text(
      family = font, size = 13, color = text_color, face = "bold",
      margin = margin(b = 10)
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0
    )
  )

# Same correlation pairs as circles: fill encodes the signed correlation,
# area encodes its absolute value.
corre %>%
  filter(as.integer(Var1) < as.integer(Var2)) %>%
  ggplot(aes(x = Var1, y = Var2, fill = value, size = abs(value))) +
  geom_point(shape = 21, stroke = 0) +
  scale_y_discrete(name = NULL, expand = c(0, 0.5)) +
  scale_x_discrete(name = NULL, position = "top", expand = c(0, 0.5)) +
  # The size legend is suppressed; the subtitle explains the encoding instead.
  scale_size_area(max_size = 19, limits = c(0, 0.91), guide = "none") +
  scale_fill_continuous_divergingx(
    name = "correlation",
    palette = "PuOr",
    rev = FALSE,
    limits = c(-0.9, 0.91),
    breaks = c(-0.9, 0, 0.9),
    labels = c("–0.9", "0", "0.9"),
    guide = guide_colorbar(
      direction = "horizontal",
      title.position = "top",
      label.position = "bottom",
      barwidth = grid::unit(140, "pt"),
      barheight = grid::unit(17.5, "pt"),
      ticks.linewidth = 1
    )
  ) +
  coord_fixed() +
  labs(
    title = "Correlation in automobile aspects",
    subtitle = "The size of circles denotes the magnitude of each correlation",
    caption = "Source: mtcars"
  ) +
  theme(
    panel.background = element_blank(),
    # Axes: labels only
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.ticks.length = grid::unit(3, "pt"),
    axis.text = element_text(family = font, color = text_color, size = 11),
    # Colour bar sits in the empty lower-right triangle
    legend.position = c(.97, .0),
    legend.justification = c(1, 0),
    legend.title.align = 0.5,
    legend.title = element_text(family = font, color = text_color, size = 11),
    legend.text = element_text(family = font, color = text_color, size = 11),
    # Title, subtitle and caption
    plot.title = element_text(
      family = font, size = 13, color = text_color, face = "bold"
    ),
    plot.subtitle = element_text(
      family = font, size = 11, color = text_color, hjust = 0
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0
    )
  )

3.4 Slopegraph

# Read the CO2 emissions table.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists.
co2_emissions <- read.csv(
  "/Users/huvi/Desktop/datasets/annual-co2-emissions-per-country.csv",
  sep = ","
)

# Keep ten countries and three years, with emissions scaled down by 1e6.
# NOTE(review): co2 is Annual.CO2.emissions / 1e6, while the axis title and
# plot title below describe it as tons per person; confirm the units.
emissions_data <- co2_emissions %>%
  dplyr::select(country = Entity, year = Year, co2 = Annual.CO2.emissions) %>%
  mutate(co2 = co2 / 1000000, year = as.numeric(year)) %>%
  filter(
    country %in% c(
      "Trinidad and Tobago", "Qatar", "United Arab Emirates", "Oman",
      "Bahrain", "Singapore", "Netherlands", "Kazakhstan",
      "Equatorial Guinea", "Kuwait"
    ),
    year %in% c(2000, 2005, 2010)
  )

# Country labels are placed next to the 2010 points. `nudge` is a manual
# vertical offset per row to separate overlapping labels.
# NOTE(review): the offsets are matched to rows by position, so they depend on
# the row order of the input file and on all ten countries being present.
labels <- as_tibble(filter(emissions_data, year == 2010))
labels$nudge <- c(.1, .1, .1, .1, .1, -4, .1, 3, 4, .1)

# Slopegraph using only the two end years.
ggplot(filter(emissions_data, year != 2005), aes(x = year, y = co2)) +
  geom_line(aes(group = country), color = "gray60") +
  # A larger white point under each blue point leaves a gap around the line.
  geom_point(color = "white", size = 3) +
  geom_point(color = "#0072B2", size = 2) +
  geom_text(
    data = labels,
    aes(
      x = 2010 + 0.45,
      y = co2 + nudge,
      label = country
    ),
    family = font,
    size = 10 / .pt,
    hjust = 0
  ) +
  # The x range runs to 2020 to leave room for the country labels.
  scale_x_continuous(
    limits = c(2000, 2020),
    breaks = c(2000, 2010),
    labels = c("2000", "2010"),
    expand = expansion(add = c(1, 0)),
    name = NULL,
    position = "top"
  ) +
  scale_y_continuous(
    limits = c(-2, 260),
    expand = c(0, 0),
    # Plotmath expression so that the 2 in CO2 is rendered as a subscript.
    name = parse(text = "`CO`[2]*` emissions (tons / person)`")
  ) +
  labs(
    caption = "Source: World Bank",
    title = "CO2 emissions per person in 2000 and 2010 for 10 countries"
  ) +
  theme(
    panel.background = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(family = font, size = 11, color = text_color),
    axis.title.y = element_text(family = font, size = 11, color = text_color),
    axis.text.y = element_text(family = font, size = 11, color = text_color),
    axis.line.y.left = element_line(color = text_color),
    # Negative hjust shifts title and caption left of the panel edge.
    plot.title = element_text(
      family = font, size = 14, color = text_color, face = "bold",
      margin = margin(b = 10), hjust = -0.1
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color,
      margin = margin(t = 10), hjust = -0.06
    )
  )

# Slopegraph variant with the intermediate year 2005 included. Uses
# `emissions_data` and `labels` as prepared for the two-year slopegraph.
# Title and caption are intentionally disabled for this plot; the labels that
# were prepared for it are:
#   caption = "Source: World Bank"
#   title = "CO2 emissions per person from 2000 to 2010 for 10 countries"
ggplot(emissions_data, aes(x = year, y = co2)) +
  geom_line(aes(group = country), color = "gray60") +
  # A larger white point under each blue point leaves a gap around the line.
  geom_point(color = "white", size = 3) +
  geom_point(color = "#0072B2", size = 2) +
  geom_text(
    data = labels,
    aes(
      x = 2010 + 0.45,
      y = co2 + nudge,
      label = country
    ),
    family = font,
    size = 10 / .pt,
    hjust = 0
  ) +
  # The x range runs to 2020 to leave room for the country labels.
  scale_x_continuous(
    limits = c(2000, 2020),
    breaks = c(2000, 2005, 2010),
    labels = c("2000", "2005", "2010"),
    expand = expansion(add = c(1, 0)),
    name = NULL,
    position = "top"
  ) +
  scale_y_continuous(
    limits = c(-2, 260),
    expand = c(0, 0),
    # Plotmath expression so that the 2 in CO2 is rendered as a subscript.
    name = parse(text = "`CO`[2]*` emissions (tons / person)`")
  ) +
  theme(
    panel.background = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(family = font, size = 11, color = text_color),
    axis.title.y = element_text(family = font, size = 11, color = text_color),
    axis.text.y = element_text(family = font, size = 11, color = text_color),
    axis.line.y.left = element_line(color = text_color),
    plot.title = element_text(
      family = font, size = 14, color = text_color, face = "bold",
      margin = margin(b = 10), hjust = -0.2
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color,
      margin = margin(t = 10), hjust = -0.06
    )
  )

4 Visualize proportion

4.1 Waffle chart

# Number of squares per group. With 14 squares for China and for India, one
# square stands for 100 million people, which is what the x label states.
data <- data.frame(
  names = c("China", "India", "USA", "Other countries"),
  vals = c(14, 14, 3, 43)
)

# Waffle chart laid out in 6 rows, one colour per group.
waffle(
  data,
  rows = 6,
  colors = c("#8dd3c7", "#80b1d3", "#fdb462", "#fb8072"),
  xlab = "1 square = 100 million people",
  title = "World population in 2021"
)

4.2 Pie chart

# Five groups with their values, sorted ascending, plus the geometry of each
# slice. Angles are in radians, measured cumulatively over the sorted values.
data <- data.frame(
  group = LETTERS[1:5],
  value = c(13, 7, 9, 21, 2)
) %>%
  arrange(value) %>%
  mutate(
    total = sum(value),
    # Angle at which each slice ends / starts.
    end_angle = 2 * pi * cumsum(value) / total,
    start_angle = lag(end_angle, default = 0),
    # Mid-slice angle, used to position the text labels.
    mid_angle = 0.5 * (start_angle + end_angle),
    # Justify labels away from the pie: by half for hjust, by quarter-turn
    # band for vjust.
    hjust = ifelse(mid_angle > pi, 1, 0),
    vjust = ifelse(mid_angle >= pi / 2 & mid_angle <= 3 * pi / 2, 1, 0)
  )

# Pie radius, and the slightly larger radius at which group labels are drawn.
rpie <- 1
rlabel <- 1.05 * rpie

ggplot(data) +
  # Slices: arcs with inner radius 0 and outer radius rpie.
  geom_arc_bar(
    aes(
      x0 = 0, y0 = 0, r0 = 0, r = rpie,
      start = start_angle, end = end_angle, fill = group
    ),
    colour = "white",
    size = 0.5
  ) +
  # Group letter just outside each slice.
  geom_text(
    aes(
      x = rlabel * sin(mid_angle),
      y = rlabel * cos(mid_angle),
      label = group,
      hjust = hjust,
      vjust = vjust
    ),
    family = font,
    size = 7
  ) +
  # Value inside each slice, at 60% of the radius.
  geom_text(
    aes(
      x = 0.6 * sin(mid_angle),
      y = 0.6 * cos(mid_angle),
      label = value
    ),
    family = font,
    size = 7
  ) +
  scale_fill_manual(
    values = c("#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0")
  ) +
  # Both position scales are stripped of title, breaks and labels.
  scale_x_continuous(
    name = "",
    breaks = NULL,
    labels = NULL,
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    name = "",
    breaks = NULL,
    labels = NULL,
    expand = c(0, 0)
  ) +
  # Equal aspect ratio keeps the pie round; clipping is off so outside labels
  # are not cut.
  coord_fixed(clip = "off") +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    plot.margin = margin(t = 20, r = 20)
  )

4.3 Donut chart

# Three categories with their counts, plus the derived ring geometry.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
) %>%
  mutate(
    # Share of the total for each category.
    fraction = count / sum(count),
    # Cumulative share: upper edge of each rectangle ...
    ymax = cumsum(fraction),
    # ... and its lower edge (the previous upper edge, 0 for the first).
    ymin = c(0, head(ymax, n = -1)),
    # Labels sit halfway along each segment.
    labelPosition = (ymax + ymin) / 2,
    label = paste0(category, "\n value: ", count)
  )

# Stacked rectangles between x = 3 and x = 4, wrapped into a ring.
ggplot(data, aes(ymin = ymin, ymax = ymax, xmin = 3, xmax = 4, fill = category)) +
  geom_rect() +
  geom_label(
    aes(y = labelPosition, label = label),
    x = 3.5,
    size = 6,
    family = font
  ) +
  # Polar coordinates on y turn the stacked bar into a ring.
  coord_polar(theta = "y") +
  # Starting x below xmin leaves the hole in the middle.
  xlim(c(2, 4)) +
  scale_fill_manual(
    values = c("#66c2a5", "#fc8d62", "#8da0cb")
  ) +
  theme(
    legend.position = "none",
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = margin(t = 20, r = 20)
  )

4.4 Mosaic plot

# Column names applied to the bridges file when it is read.
colNames <- c(
  "IDENTIF", "RIVER", "LOCATION", "ERECTED", "PURPOSE", "LENGTH", "LANES",
  "CLEAR-G", "T-OR-D", "MATERIAL", "SPAN", "REL-L", "TYPE"
)

# Read the bridges data and bin the construction year into four eras.
# NOTE(review): absolute, machine-specific path.
bridge <- read.csv(
  "/Users/huvi/Desktop/datasets/bridges.data.version.csv",
  col.names = colNames,
  na.strings = c("NA", "-", "?", " ")
) %>%
  dplyr::select(ERECTED, material = MATERIAL) %>%
  mutate(
    erected = case_when(
      ERECTED < 1870 ~ "craft",
      ERECTED >= 1870 & ERECTED <= 1889 ~ "emerging",
      ERECTED >= 1890 & ERECTED <= 1939 ~ "mature",
      # `>=` so that 1940 itself is classified instead of becoming NA.
      ERECTED >= 1940 ~ "modern"
    )
  )

# Count bridges per material x era, and add the per-era total that drives the
# column widths. The result stays grouped by `erected`.
bridges_tidy <- bridge %>%
  dplyr::select(material, erected) %>%
  table() %>%
  reshape2::melt() %>%
  dplyr::select(material, erected, count = value) %>%
  mutate(
    material = case_when(
      material == "IRON" ~ "iron",
      material == "STEEL" ~ "steel",
      material == "WOOD" ~ "wood"
    )
  ) %>%
  group_by(erected) %>%
  mutate(group_count = sum(count))

# Label positions: vertical midpoint of each non-empty segment, as a fraction
# of its column. The mature/wood label is blanked out (set to NA) and later
# dropped by na.rm = TRUE.
labels_df <- group_by(bridges_tidy, erected) %>%
  filter(count != 0) %>%
  arrange(desc(material)) %>%
  mutate(
    y = (cumsum(count) - 0.5 * count) / group_count,
    y = ifelse(
      erected == "mature" & material == "wood", NA, y
    )
  )

# Mosaic plot: one filled bar per era, bar width = number of bridges in that
# era, segment height = share of each material.
ggplot(bridges_tidy) +
  aes(x = erected, y = count, width = group_count, fill = material) +
  geom_bar(stat = "identity", position = "fill", colour = "white", size = 1) +
  # Free x scale and free x space let each facet take the width of its bar.
  facet_grid(~erected, scales = "free_x", space = "free_x") +
  geom_text(
    data = labels_df,
    aes(y = y, label = count, color = material),
    na.rm = TRUE,
    size = 12 / .pt,
    family = font
  ) +
  scale_y_continuous(
    name = NULL
  ) +
  scale_x_discrete(
    name = NULL
  ) +
  scale_fill_manual(
    values = c("#D55E00D0", "#0072B2D0", "#009E73D0")
  ) +
  scale_color_manual(
    values = c(iron = "white", wood = "white", steel = "white")
  ) +
  labs(
    title = "Mosaic plot for 106 bridges in Pittsburgh",
    subtitle = paste(
      "The widths of each rectangle are proportional to the number of bridges built in that era",
      "The heights of each rectangle are proportional to the number of bridges built from that material",
      "Numbers show the counts of bridges in each category",
      sep = "\n"
    ),
    caption = "Source: Yoram Reich and Steven J. Fenves"
  ) +
  coord_cartesian(clip = "off") +
  theme(
    panel.background = element_blank(),
    line = element_blank(),
    strip.text = element_blank(),
    axis.ticks.length = unit(0, "pt"),
    axis.text.y = element_blank(),
    axis.text.x = element_text(family = font, size = 11, color = text_color),
    # No gap between facets, so the bars read as one mosaic.
    panel.spacing.x = unit(0, "pt"),
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold"
    ),
    plot.subtitle = element_text(family = font, size = 11, color = text_color),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0,
      margin = margin(t = 10)
    ),
    legend.position = "top",
    legend.justification = c(0, 0),
    legend.title = element_blank(),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.text = element_text(family = font, size = 11, color = text_color),
    legend.margin = margin(b = -20, l = -2)
  )

4.5 Treemap

# Column names applied to the bridges file when it is read.
colNames <- c(
  "IDENTIF", "RIVER", "LOCATION", "ERECTED", "PURPOSE", "LENGTH", "LANES",
  "CLEAR-G", "T-OR-D", "MATERIAL", "SPAN", "REL-L", "TYPE"
)

# Read the bridges data and bin the construction year into four eras.
# NOTE(review): absolute, machine-specific path.
bridge <- read.csv(
  "/Users/huvi/Desktop/datasets/bridges.data.version.csv",
  col.names = colNames,
  na.strings = c("NA", "-", "?", " ")
) %>%
  dplyr::select(ERECTED, material = MATERIAL) %>%
  mutate(
    erected = case_when(
      ERECTED < 1870 ~ "craft",
      ERECTED >= 1870 & ERECTED <= 1889 ~ "emerging",
      ERECTED >= 1890 & ERECTED <= 1939 ~ "mature",
      # `>=` so that 1940 itself is classified instead of becoming NA.
      ERECTED >= 1940 ~ "modern"
    )
  )

# Count bridges per material x era, plus the per-era total. The result stays
# grouped by `erected`.
bridges_tidy <- bridge %>%
  dplyr::select(material, erected) %>%
  table() %>%
  reshape2::melt() %>%
  dplyr::select(material, erected, count = value) %>%
  mutate(
    material = case_when(
      material == "IRON" ~ "iron",
      material == "STEEL" ~ "steel",
      material == "WOOD" ~ "wood"
    )
  ) %>%
  group_by(erected) %>%
  mutate(group_count = sum(count))

# Fill palette: for each of the three base colours, four shades that are
# progressively less lightened (0.9, 0.6, 0.3, then the base colour itself).
# vapply() returns a 4 x 3 matrix; c() flattens it column by column, giving
# the four shades of the first colour, then of the second, then of the third.
filcols <- c("#D55E00D0", "#0072B2D0", "#009E73D0")
filcols <- c(vapply(
  filcols,
  function(x) c(lighten(x, .9), lighten(x, .6), lighten(x, .3), x),
  character(4)
))

# Treemap: tile area = bridge count, tiles grouped by material, fill chosen
# per era x material combination.
ggplot(
  bridges_tidy,
  aes(area = count, subgroup = material, fill = interaction(erected, material))
) +
  geom_treemap(color = "white", size = 0.5 * .pt, alpha = NA) +
  # Large, semi-transparent material name behind each subgroup.
  geom_treemap_subgroup_text(
    family = font,
    colour = "grey50",
    place = "centre", alpha = 0.7,
    grow = TRUE
  ) +
  geom_treemap_subgroup_border(color = "white") +
  # Era name in the centre of each tile.
  geom_treemap_text(
    aes(label = erected, color = interaction(erected, material)),
    family = font,
    place = "centre",
    grow = FALSE
  ) +
  scale_fill_manual(values = filcols) +
  # Text colour per tile; keys are interaction levels `<erected>.<material>`
  # and must match the `erected` labels assigned above ("craft", not
  # "crafts"). The "modern" tiles get white text, all others black.
  scale_color_manual(values = c(
    craft.iron = "black", craft.steel = "black", craft.wood = "black",
    emerging.iron = "black", emerging.steel = "black", emerging.wood = "black",
    mature.iron = "black", mature.steel = "black", mature.wood = "black",
    modern.iron = "white", modern.steel = "white", modern.wood = "white"
  )) +
  coord_cartesian(clip = "off") +
  guides(colour = "none", fill = "none") +
  labs(
    title = "Treemap plot for 106 bridges in Pittsburgh",
    subtitle = "The area of each rectangle is proportional to the number of bridges of that type",
    caption = "Source: Yoram Reich and Steven J. Fenves"
  ) +
  theme(
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold"
    ),
    plot.subtitle = element_text(family = font, size = 11, color = text_color),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color, hjust = 0,
      margin = margin(t = 10)
    )
  )

4.6 Parallel sets plot

# Column names applied to the bridges file when it is read.
colNames <- c(
  "IDENTIF", "RIVER", "LOCATION", "ERECTED", "PURPOSE", "LENGTH", "LANES",
  "CLEAR-G", "T-OR-D", "MATERIAL", "SPAN", "REL-L", "TYPE"
)

# Read the bridges data, then bin construction year into eras and bridge
# length into three size classes.
# NOTE(review): absolute, machine-specific path.
bridge <- read.csv(
  "/Users/huvi/Desktop/datasets/bridges.data.version.csv",
  col.names = colNames,
  na.strings = c("NA", "-", "?", " ")
) %>%
  mutate(
    erected = case_when(
      ERECTED < 1870 ~ "CRAFTS",
      ERECTED >= 1870 & ERECTED <= 1889 ~ "EMERGING",
      ERECTED >= 1890 & ERECTED <= 1939 ~ "MATURE",
      # `>=` so that 1940 itself is classified instead of becoming NA.
      ERECTED >= 1940 ~ "MODERN"
    ),
    # All three branches must test LENGTH. Rows with a missing LENGTH get an
    # NA class and are left out by table() below instead of being counted as
    # "medium".
    length = case_when(
      LENGTH < 1500 ~ "SHORT",
      LENGTH >= 1500 & LENGTH <= 3000 ~ "MEDIUM",
      LENGTH > 3000 ~ "LONG"
    )
  )

# Count bridges for every material x era x river x length combination
# (river "Y" excluded), then recode each dimension to lower-case / full names
# with an explicit level order that fixes the stacking order on each axis.
data <- bridge %>%
  dplyr::select(material = MATERIAL, erected, river = RIVER, length) %>%
  filter(river != "Y") %>%
  table() %>%
  reshape2::melt() %>%
  rename(count = value) %>%
  mutate(
    material = factor(
      case_when(
        material == "IRON" ~ "iron",
        material == "STEEL" ~ "steel",
        material == "WOOD" ~ "wood"
      ),
      levels = c("wood", "steel", "iron")
    ),
    erected = factor(
      case_when(
        erected == "CRAFTS" ~ "crafts",
        erected == "EMERGING" ~ "emerging",
        erected == "MATURE" ~ "mature",
        erected == "MODERN" ~ "modern"
      ),
      levels = c("modern", "mature", "emerging", "crafts")
    ),
    length = factor(
      case_when(
        length == "LONG" ~ "long",
        length == "MEDIUM" ~ "medium",
        length == "SHORT" ~ "short"
      ),
      levels = c("short", "medium", "long")
    ),
    river = factor(
      case_when(
        river == "A" ~ "Allegheny",
        river == "M" ~ "Monongahela",
        river == "O" ~ "Ohio"
      ),
      levels = c("Ohio", "Monongahela", "Allegheny")
    )
  )

# Reshape the first four columns into the long format expected by the
# parallel-sets geoms, and fix the left-to-right order of the axes.
data <- gather_set_data(data, 1:4)
data$x <- factor(data$x, levels = c("material", "length", "erected", "river"))

ggplot(data, aes(x, id = id, split = y, value = count)) +
  # Bands between axes, coloured by construction material.
  geom_parallel_sets(aes(fill = material), alpha = 0.5, axis.width = 0.13) +
  geom_parallel_sets_axes(axis.width = 0.1, fill = "grey80", color = "grey80") +
  geom_parallel_sets_labels(
    color = "black",
    family = font,
    size = 10 / .pt,
    angle = 90
  ) +
  scale_x_discrete(
    name = NULL,
    expand = c(0, 0.2)
  ) +
  scale_y_continuous(breaks = NULL, expand = c(0, 0)) +
  scale_fill_manual(
    values = c(iron = "#D55E00D0", wood = "#009E73D0", steel = "#0072B2D0"),
    guide = "none"
  ) +
  labs(
    title = "Parallel sets plot for 106 bridges in Pittsburgh",
    subtitle = "The coloring of the bands highlights the construction material of different bridges",
    caption = "Source: Yoram Reich and Steven J. Fenves"
  ) +
  theme(
    panel.background = element_blank(),
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_text(family = font, size = 11, color = text_color),
    strip.text = element_text(family = font, size = 11, color = text_color),
    plot.margin = margin(14, 1.5, 2, 1.5),
    plot.title = element_text(
      family = font, size = 15, color = text_color, face = "bold",
      hjust = 0.13
    ),
    plot.subtitle = element_text(
      family = font, size = 11, color = text_color, hjust = 0.22
    ),
    plot.caption = element_text(
      family = font, size = 10, color = caption_color,
      margin = margin(t = 10, b = 5), hjust = 0.05
    )
  )

5 Visualize evolution

5.1 Line chart

5.1.1 Single variable

# Read the COVID data table.
# NOTE(review): absolute, machine-specific path.
covid <- read.csv("/Users/huvi/Desktop/datasets/owid-covid-data.csv")

# Rows with iso_code "DEU": parse the date and add a 7-value rolling mean of
# new cases per million. `fill = NA` pads the ends so MA7 keeps the same
# length as the input column.
ger <- covid %>%
  filter(iso_code == "DEU") %>%
  dplyr::select(date, new_cases_per_million) %>%
  mutate(
    date = as.Date(date),
    MA7 = zoo::rollmean(new_cases_per_million, k = 7, fill = NA)
  )

# Line chart of the rolling mean over time.
ggplot(ger, aes(x = date, y = MA7)) +
  geom_line(color = "#0072B2", size = 0.6) +
  scale_y_continuous(
    # Spelled out in full rather than relying on partial matching of `limit`.
    limits = c(0, 350),
    expand = c(0, 0),
    breaks = c(0, 50, 100, 150, 200, 250, 300),
    name = "cases/million people"
  ) +
  scale_x_date(
    name = "date"
  ) +
  labs(
    caption = "Source: ourworldindata.org",
    title = "7-day moving average of new COVID cases in Germany"
  ) +
  theme(
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title = element_text(family = font, size = 11, color = text_color),
    axis.line = element_line(color = text_color, size = 0.4),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_text(margin = margin(t = 10)),
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    )
  )

5.1.2 Multiple variables

# Total vaccinations per hundred people for six locations, taken from the
# `covid` table read earlier in the script. Rows with any missing value are
# dropped.
vaccine <- covid %>%
  filter(
    location %in% c(
      "United Kingdom", "Germany", "Israel", "United States",
      "United Arab Emirates", "Canada"
    )
  ) %>%
  dplyr::select(date, location, total_vaccinations_per_hundred) %>%
  mutate(
    date = as.Date(date)
  ) %>%
  drop_na()

# One line per location.
ggplot(
  vaccine,
  aes(x = date, y = total_vaccinations_per_hundred, color = location)
) +
  geom_line(size = 0.6) +
  scale_y_continuous(
    # Spelled out in full rather than relying on partial matching of `limit`.
    limits = c(0, 125),
    expand = c(0, 0),
    breaks = c(0, 25, 50, 75, 100, 125),
    name = "vaccinations/hundred people"
  ) +
  scale_x_date(
    name = "date"
  ) +
  # Six colours, assigned to the six locations in scale order.
  scale_color_manual(
    values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#6a3d9a")
  ) +
  labs(
    caption = "Source: ourworldindata.org",
    title = "Number of vaccinations per hundred people in 6 countries"
  ) +
  theme(
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title = element_text(family = font, size = 11, color = text_color),
    axis.line = element_line(color = text_color, size = 0.4),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_text(margin = margin(t = 10)),
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    ),
    legend.position = "top",
    legend.box.spacing = unit(3.5, "pt"), # distance between legend and plot
    legend.text = element_text(family = font, color = text_color, vjust = 0.6),
    legend.spacing.x = unit(2, "pt"),
    legend.background = element_rect(fill = "white", color = "white"),
    legend.key.width = unit(10, "pt"),
    legend.key = element_blank(),
    legend.title = element_blank()
  )

5.2 Area chart

# Area chart of the rolling mean in `ger` (prepared for the single-variable
# line chart): a ridgeline with baseline y = 0 and height MA7, drawn with a
# semi-transparent fill under the outline.
ggplot(ger, aes(x = date, height = MA7, y = 0)) +
  geom_ridgeline(color = "#0072B2", fill = "#0072B240", size = 0.75) +
  scale_y_continuous(
    # Spelled out in full rather than relying on partial matching of `limit`.
    limits = c(0, 350),
    expand = c(0, 0),
    breaks = c(0, 50, 100, 150, 200, 250, 300),
    name = "cases/million people"
  ) +
  scale_x_date(
    name = "date"
  ) +
  labs(
    caption = "Source: ourworldindata.org",
    title = "7-day moving average of new COVID cases in Germany"
  ) +
  theme(
    axis.text = element_text(family = font, size = 11, color = text_color),
    axis.title = element_text(family = font, size = 11, color = text_color),
    axis.line = element_line(color = text_color, size = 0.4),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    axis.title.x = element_text(margin = margin(t = 10)),
    plot.title = element_text(
      family = font, color = text_color, size = 15, face = "bold",
      margin = margin(b = 20)
    ),
    plot.caption = element_text(
      family = font, color = caption_color, size = 9, hjust = 0, vjust = 2
    )
  )

Data Visualization Collection

Vinh Hung Le

1 Visualize amount

1.1 Dot plot/lollipop chart

1.1.1 One group

1.1.2 Change baseline

1.2 Dumbbell chart

1.3 Radar chart

1.3.1 One observation

1.3.2 Multiple observations 1

1.3.3 Multiple observations 2

1.4 Bar plot

1.4.1 Vertical bars

1.4.2 Horizontal bars

1.4.3 Grouped bars

1.4.4 Stacked Bars

1.5 Heatmap

1.6 Word cloud

2 Visualize distribution

2.1 Histogram

2.1.1 Single histogram

2.1.2 Two histograms

2.2 Density plot

2.2.1 Single density

2.2.2 Two densities

2.2.3 Multiple densities

3 Visualize correlation

3.1 Scatterplot

3.1.1 Basic scatterplot

3.1.2 All-against-all scatterplot

3.2 Bubble chart

3.3 Correlogram

3.4 Slopegraph

4 Visualize proportion

4.1 Waffle chart

4.2 Pie chart

4.3 Donut chart

4.4 Mosaic plot

4.5 Treemap

4.6 Parallel sets plot

5 Visualize evolution

5.1 Line chart

5.1.1 Single variable

5.1.2 Multiple variables

5.2 Area chart