Obtain data

This blogpost walks through how to obtain data about albums https://medium.com/@caineosborne/analysing-uk-chart-history-1956-to-2017-6fec0ecc991b

This gives the Top 20 albums on 1961-07-02

http://www.officialcharts.com/charts/albums-chart/19610702/7502/

This gives the Top 100 singles for 1999-12-26 to 2000-01-01

https://www.officialcharts.com/charts/singles-chart/19991226/7501/

Let’s generate the dates from then until now

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 1.0.0 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(rvest)


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

library(furrr)

Loading required package: future

dates_charts <- seq(ymd("1999-12-26"), ymd("2023-01-01"), "7 days")

This function will extract the data:

get_week_top100_data <- function(week_start) {
  
  raw_date = str_remove_all(week_start, "-")
  
  page_results <-
    str_glue("https://www.officialcharts.com/charts/singles-chart/{raw_date}/7501/") %>%
    read_html()
  
  
  page_results %>%
    html_table() %>%
    .[[1]] %>%
    clean_names() %>%
    filter(pos %in% as.character(1:100)) %>%
    select(1:3) %>%
    separate(title_artist,
             sep = "\r\n                        \r\n                        \r\n\r\n                            ",
             into = c("title", "artist_label")) %>%
    separate(artist_label,
             sep = "\r\n                        \r\n                        ",
             into = c("artist", "label")) %>%
    set_names(c(
      "position_current",
      "position_previous",
      "title",
      "artist",
      "label"
    )) %>% 
    mutate(date_week_start = week_start) %>% 
    relocate(date_week_start)
}

dates_charts[1] %>% 
  get_week_top100_data()

# A tibble: 100 × 6
   date_week_start position_current position_previous title         artist label
   <date>          <chr>            <chr>             <chr>         <chr>  <chr>
 1 1999-12-26      1                1                 I HAVE A DRE… WESTL… RCA  
 2 1999-12-26      2                2                 THE MILLENNI… CLIFF… PAPI…
 3 1999-12-26      3                3                 IMAGINE       JOHN … PARL…
 4 1999-12-26      4                8                 MR. HANKEY T… MR HA… COLU…
 5 1999-12-26      5                6                 RE-REWIND TH… ARTFU… RELE…
 6 1999-12-26      6                5                 TWO IN A MIL… S CLU… POLY…
 7 1999-12-26      7                4                 COGNOSCENTI … CUBAN… EMI  
 8 1999-12-26      8                7                 SAY YOU'LL B… STEPS  JIVE 
 9 1999-12-26      9                10                KISS (WHEN T… VENGA… POSI…
10 1999-12-26      10               11                BACK IN MY L… ALICE… POSI…
# … with 90 more rows

Now let’s map this

plan(multisession)
data_raw_all_charts <- dates_charts %>% 
  future_map_dfr(get_week_top100_data)

data_raw_all_charts %>% 
  write_csv(here::here("posts", "2023-02-06_dataviz_top-10-sankey", "data_raw_all_charts.csv"))

Explore and tidy up

data_raw_all_charts <- read_csv(here::here("posts", "2023-02-06_dataviz_top-10-sankey", "data_raw_all_charts.csv"))

Rows: 119900 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): position_previous, title, artist, label
dbl  (1): position_current
date (1): date_week_start

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

There are ZERO NA values!!!

data_charts_with_ids <- data_raw_all_charts %>% 
  group_by(title, artist) %>% 
  mutate(id_title_artist = cur_group_id()) %>%
  group_by(title, artist, label) %>% 
  mutate(id_title_artist_label = cur_group_id()) %>% 
  ungroup()

There are 399 more unique songs when splitting the data by Title, Artist and Label.

After looking at these everything feel okay. I can go with title and artist ids.

data_charts_with_ids %>% 
  group_by(title, artist) %>% 
  mutate(n_labels = n_distinct(label)) %>% 
  ungroup() %>% 
  filter(n_labels > 1) %>% 
  select(title, artist, label, contains("id"), n_labels) %>% 
  unique() %>% 
  arrange(title, artist)

# A tibble: 784 × 6
   title             artist       label            id_title_ar…¹ id_ti…² n_lab…³
   <chr>             <chr>        <chr>                    <int>   <int>   <int>
 1 1999              PRINCE       WARNER BROS                 89      89       2
 2 1999              PRINCE       WEA                         89      90       2
 3 21 GUNS           GREEN DAY    REPRISE                    115     116       2
 4 21 GUNS           GREEN DAY    WARNER BROS                115     117       2
 5 21ST CENTURY LIFE SAM SPARRO   UNIVERSAL                  124     127       2
 6 21ST CENTURY LIFE SAM SPARRO   ISLAND                     124     126       2
 7 22                LILY ALLEN   PARLOPHONE                 125     128       2
 8 22                LILY ALLEN   REGAL RECORDINGS           125     129       2
 9 4 IN THE MORNING  GWEN STEFANI INTERSCOPE                 175     179       2
10 4 IN THE MORNING  GWEN STEFANI POLYDOR                    175     180       2
# … with 774 more rows, and abbreviated variable names ¹id_title_artist,
#   ²id_title_artist_label, ³n_labels

Inserting consecutiveness

position_previous needs recoding

data_charts_with_lags <- data_charts_with_ids %>%
  arrange(date_week_start) %>%
  group_by(id_title_artist) %>%
  mutate(
    position_next = lead(position_current, order_by = date_week_start)
  ) %>% 
  ungroup() %>% 
  mutate(status = case_when(position_previous == "New" ~ "New release",
                            position_previous == "Re" ~ "Re-release",
                            !is.na(position_next) ~ "Consecutive week"),
         position_previous = as.numeric(position_previous))

Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

data_charts_with_rereleases <- data_charts_with_lags %>% 
  arrange(date_week_start, id_title_artist) %>% 
  group_by(id_title_artist) %>%
  arrange(date_week_start) %>% 
  mutate(check_rerelease = ifelse(status == "Re-release", 1, 0)) %>%
  mutate(check_rerelease = case_when(
    date_week_start == min(date_week_start) ~ 0, # handle re-release in first week of data
    status == "Re-release" ~ 1,
    TRUE ~ 0
  )) %>%
  mutate(nth_time_on_chart = cumsum(check_rerelease) + 1) %>%
  ungroup()

Look at re-releases

data_charts_with_rereleases %>% 
  select(date_week_start, title, starts_with("position"), nth_time_on_chart, everything()) %>% 
  filter(title == "SHAPE OF YOU")

# A tibble: 98 × 12
   date_wee…¹ title posit…² posit…³ posit…⁴ nth_t…⁵ artist label id_ti…⁶ id_ti…⁷
   <date>     <chr>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  <chr>   <int>   <int>
 1 2017-01-15 SHAP…       1      NA       1       1 ED SH… ASYL…   12527   12802
 2 2017-01-22 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 3 2017-01-29 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 4 2017-02-05 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 5 2017-02-12 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 6 2017-02-19 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 7 2017-02-26 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 8 2017-03-05 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
 9 2017-03-12 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
10 2017-03-19 SHAP…       1       1       1       1 ED SH… ASYL…   12527   12802
# … with 88 more rows, 2 more variables: status <chr>, check_rerelease <dbl>,
#   and abbreviated variable names ¹date_week_start, ²position_current,
#   ³position_previous, ⁴position_next, ⁵nth_time_on_chart, ⁶id_title_artist,
#   ⁷id_title_artist_label

data_charts_with_rereleases %>% 
  arrange(desc(nth_time_on_chart)) %>% 
  group_by(id_title_artist) %>% 
  filter(nth_time_on_chart == max(nth_time_on_chart)) %>% 
  ungroup() %>% 
  distinct(title, artist, nth_time_on_chart) %>% 
  rename(times_on_chart = nth_time_on_chart) %>% 
  slice(1:10) %>% 
  gt::gt()

title	artist	times_on_chart
MR BRIGHTSIDE	KILLERS	58
USE SOMEBODY	KINGS OF LEON	21
ONE DAY LIKE THIS	ELBOW	20
THRILLER	MICHAEL JACKSON	19
CHASING CARS	SNOW PATROL	18
FAIRYTALE OF NEW YORK	POGUES FT KIRSTY MACCOLL	18
I DON'T WANT TO MISS A THING	AEROSMITH	17
DON'T STOP ME NOW	QUEEN	17
MERRY XMAS EVERYBODY	SLADE	17
ALL I WANT FOR CHRISTMAS IS YOU	MARIAH CAREY	16

Finish up

data_3rd_millenium_charts <- data_charts_with_rereleases %>% 
  mutate(year = year(date_week_start)) %>% 
  group_by(id_title_artist) %>% 
  mutate(total_times_on_chart = max(nth_time_on_chart)) %>% 
  ungroup() %>% 
  select(-check_rerelease) %>% 
  group_by(id_title_artist) %>% 
  arrange(id_title_artist, date_week_start) %>% 
  mutate(nth_week_on_chart = n()) %>% 
  mutate(total_weeks_on_chart = max(nth_week_on_chart)) %>% 
  ungroup()

data_3rd_millenium_charts <- data_charts_with_rereleases %>% 
  mutate(year = year(date_week_start)) %>% 
  select(-check_rerelease)

Functions

generate_nth_time_in_charts <- function(data) {
  data %>%
    group_by(id_title_artist) %>%
    mutate(total_times_on_chart = max(nth_time_on_chart)) %>%
    ungroup() %>%
    group_by(id_title_artist) %>%
    arrange(id_title_artist, date_week_start) %>%
    mutate(nth_week_on_chart = row_number()) %>%
    mutate(total_weeks_on_chart = max(nth_week_on_chart)) %>%
    ungroup()
}

Top 100 Track Lines

By year

To start I’m going to look at just 2022 and work up from there

data_top10_2022 <- data_3rd_millenium_charts %>% 
  filter(
    # date_week_start > ymd("2022-01-01"),
    #      date_week_start < ymd("2023-01-01"),
         year == 2022,
         position_current <= 10) %>% 
  select(date_week_start, starts_with("position"), title, id_title_artist, everything()) %>%
  mutate(id_song_nth_time = paste(id_title_artist, nth_time_on_chart)) %>% 
  arrange(date_week_start, position_current)

data_top10_2022 %>% 
  select(id_song_nth_time, everything())

# A tibble: 520 × 13
   id_so…¹ date_wee…² posit…³ posit…⁴ posit…⁵ title id_ti…⁶ artist label id_ti…⁷
   <chr>   <date>       <dbl>   <dbl>   <dbl> <chr>   <int> <chr>  <chr>   <int>
 1 9354 1  2022-01-02       1       2      40 MERR…    9354 ED SH… ATLA…    9551
 2 8148 15 2022-01-02       2       3      71 LAST…    8148 WHAM   RCA      8316
 3 610 15  2022-01-02       3       4      56 ALL …     610 MARIA… COLU…     621
 4 4359 17 2022-01-02       4       7      57 FAIR…    4359 POGUE… WARN…    4445
 5 11973 … 2022-01-02       5       9      92 ROCK…   11973 BREND… MCA     12232
 6 9355 15 2022-01-02       6       8      54 MERR…    9355 SHAKI… RCA      9552
 7 7577 11 2022-01-02       7      11      49 IT'S…    7577 MICHA… REPR…    7735
 8 2808 1  2022-01-02       8      10      83 COME…    2808 GEORG… COLU…    2857
 9 7632 12 2022-01-02       9      15      82 IT'S…    7632 ANDY … SONY…    7792
10 3894 15 2022-01-02      10      12      76 DRIV…    3894 CHRIS… WARN…    3970
# … with 510 more rows, 3 more variables: status <chr>,
#   nth_time_on_chart <dbl>, year <dbl>, and abbreviated variable names
#   ¹id_song_nth_time, ²date_week_start, ³position_current, ⁴position_previous,
#   ⁵position_next, ⁶id_title_artist, ⁷id_title_artist_label

Square chart

data_top10_2022 %>% 
  generate_nth_time_in_charts() %>% 
  drop_na(position_next) %>% 
  identity() %>% 
  filter(position_next <= 10) %>% 
  select(date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>% 
  ggplot(aes(x = date_week_start,
                y = position_current,
                group = as.character(id_title_artist),
             colour = nth_week_on_chart
             # linewidth = nth_week_on_chart
             )) +
  # geom_point(size = 0.3) +
  # stat_smooth(se = FALSE) +
  geom_line() +
  # scale_colour_viridis_c() +
  scale_colour_gradient(low = "black",
                        high = "white") +
  scale_linewidth_continuous(breaks = c(4, 5, 6)) +
  guides(colour = guide_none()) +
  theme_void() +
  theme(panel.background = element_rect(fill = "black")) +
  NULL

data_3rd_millenium_charts %>% 
  filter(
    # date_week_start > ymd("2022-01-01"),
    #      date_week_start < ymd("2023-01-01"),
         year == 2022,
         between(position_current, 0, 20)
         ) %>% 
  select(date_week_start, starts_with("position"), title, id_title_artist, everything()) %>%
  mutate(id_song_nth_time = paste(id_title_artist, nth_time_on_chart)) %>% 
  arrange(date_week_start, position_current) %>% 
  generate_nth_time_in_charts() %>% 
  drop_na(position_next) %>% 
  identity() %>% 
  filter(position_next <= 20) %>%
  select(date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>% 
  ggplot(aes(x = date_week_start,
                y = position_current,
                group = as.character(id_title_artist),
             colour = nth_week_on_chart
             # linewidth = nth_week_on_chart
             )) +
  geom_point() +
  # stat_smooth(se = FALSE) +
  geom_line() +
  # scale_colour_viridis_c() +
  scale_colour_gradient(low = "black",
                        high = "white") +
  scale_linewidth_continuous(breaks = c(4, 5, 6)) +
  guides(colour = guide_none()) +
  theme_void() +
  theme(panel.background = element_rect(fill = "black")) +
  NULL

Polar

data_top10_2022 %>% 
  generate_nth_time_in_charts() %>% 
  drop_na(position_next) %>% 
  identity() %>% 
  filter(position_next <= 10) %>% 
  select(date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>% 
  ggplot(aes(x = date_week_start,
                y = position_current,
                group = as.character(id_title_artist),
             colour = nth_week_on_chart
             # linewidth = nth_week_on_chart
             )) +
  # geom_point() +
  # stat_smooth(se = FALSE) +
  geom_line() +
  # scale_colour_viridis_c() +
  scale_colour_gradient(low = "black",
                        high = "white") +
  scale_linewidth_continuous(breaks = c(4, 5, 6)) +
  coord_polar(start = 0) +
  ylim(0, NA) +
  # theme_void() +
  # theme(panel.background = element_rect(fill = "black")) +
  NULL

Instead of song look for runs at the same position

data_top10_2022 %>% 
  filter(position_current <= 5) %>% 
  arrange(date_week_start, position_current) %>% 
  select(date_week_start, starts_with("position"), title, id_title_artist) %>% 
  mutate(position_kept = ifelse(position_current == position_next, 1, 0)) %>%
  slice(1:40) %>% 
  group_by(position_current, position_kept) %>% 
  group_by(position_current) %>%
  arrange(date_week_start) %>%
  mutate(position_run = cumsum(position_kept))  %>%
  group_by(position_current, id_title_artist, position_kept) %>%
  mutate(position_run_id = cur_group_id()) %>% 
  ungroup() %>% 
  # View()
  ggplot(aes(x = date_week_start,
             y = position_current,
             colour = position_run_id,
             group = position_run_id)) +
  geom_point() +
  geom_line() +
  scale_colour_viridis_c()

data_top10_2022 %>% 
  filter(position_current <= 6) %>%
  arrange(date_week_start, position_current) %>% 
  select(date_week_start, starts_with("position"), title, id_title_artist) %>% 
  filter(position_current == position_next) %>% 
  group_by(position_current) %>% 
  arrange(date_week_start) %>% 
  mutate(consecutive_run = lead(date_week_start) == date_week_start + days(7)) %>% 
  mutate(run_length = cumsum(consecutive_run)) %>% 
  ungroup() %>% 
  group_by(position_current, id_title_artist, consecutive_run) %>% 
  mutate(position_run_id = cur_group_id()) %>% 
  ungroup() %>% 
  arrange(date_week_start, position_current) %>% 
  filter(position_current >= 4 & position_current <= 6) %>% 
  ggplot(aes(x = date_week_start,
             y = position_current,
             # colour = position_run_id,
             group = as.character(position_run_id))) +
  # geom_point() +
  geom_line() +
  scale_colour_viridis_c()

Reuse

https://creativecommons.org/licenses/by/4.0/

Citation

BibTeX citation:

@online{hadley2023,
  author = {Charlie Hadley},
  title = {2022-09-01 {Dataviz} - {Top} 10 {Sankey}},
  date = {2023-02-06},
  url = {https://visibledata.co.uk/posts/2023-02-06_dataviz_top-10-sankey},
  langid = {en}
}

For attribution, please cite this work as:

Charlie Hadley. 2023. “2022-09-01 Dataviz - Top 10 Sankey.” February 6, 2023. https://visibledata.co.uk/posts/2023-02-06_dataviz_top-10-sankey.