Obtain data
This blogpost walks through how to obtain data about albums https://medium.com/@caineosborne/analysing-uk-chart-history-1956-to-2017-6fec0ecc991b
This gives the Top 20 albums on 1961-07-02
http://www.officialcharts.com/charts/albums-chart/19610702/7502/
This gives the Top 100 singles for 1999-12-26 to 2000-01-01
https://www.officialcharts.com/charts/singles-chart/19991226/7501/
Let’s generate the dates from then until now
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0 ✔ purrr 1.0.1
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.3.0 ✔ stringr 1.5.0
✔ readr 2.1.3 ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
Loading required package: future
# One date per chart week: step 7 days at a time from the first week of
# interest (1999-12-26) through the start of 2023.
dates_charts <- seq(ymd("1999-12-26"), ymd("2023-01-01"), by = "7 days")
This function will extract the data:
# Scrape the Official Charts Top 100 singles for one chart week.
#
# @param week_start A Date giving the first day of the chart week; it is
#   collapsed to "YYYYMMDD" to build the officialcharts.com URL.
# @return A tibble with one row per chart entry and columns
#   date_week_start, position_current, position_previous, title, artist,
#   label. position columns are character ("New"/"Re" appear in
#   position_previous).
get_week_top100_data <- function(week_start) {
  # the URL wants the date without separators, e.g. "19991226"
  raw_date <- str_remove_all(week_start, "-")

  page_results <-
    str_glue("https://www.officialcharts.com/charts/singles-chart/{raw_date}/7501/") %>%
    read_html()

  page_results %>%
    html_table() %>%
    .[[1]] %>%
    clean_names() %>%
    # keep only real chart rows (positions 1-100); drops ad/header rows
    filter(pos %in% as.character(1:100)) %>%
    select(1:3) %>%
    # title, artist and label share one cell, separated by fixed CRLF runs
    separate(title_artist,
             sep = " \r\n \r\n \r\n\r\n ",
             into = c("title", "artist_label")) %>%
    separate(artist_label,
             sep = " \r\n \r\n ",
             into = c("artist", "label")) %>%
    set_names(c(
      "position_current",
      "position_previous",
      "title",
      "artist",
      "label"
    )) %>%
    mutate(date_week_start = week_start) %>%
    relocate(date_week_start)
}
# Sanity check: scrape the first chart week only
get_week_top100_data(dates_charts[1])
# A tibble: 100 × 6
date_week_start position_current position_previous title artist label
<date> <chr> <chr> <chr> <chr> <chr>
1 1999-12-26 1 1 I HAVE A DRE… WESTL… RCA
2 1999-12-26 2 2 THE MILLENNI… CLIFF… PAPI…
3 1999-12-26 3 3 IMAGINE JOHN … PARL…
4 1999-12-26 4 8 MR. HANKEY T… MR HA… COLU…
5 1999-12-26 5 6 RE-REWIND TH… ARTFU… RELE…
6 1999-12-26 6 5 TWO IN A MIL… S CLU… POLY…
7 1999-12-26 7 4 COGNOSCENTI … CUBAN… EMI
8 1999-12-26 8 7 SAY YOU'LL B… STEPS JIVE
9 1999-12-26 9 10 KISS (WHEN T… VENGA… POSI…
10 1999-12-26 10 11 BACK IN MY L… ALICE… POSI…
# … with 90 more rows
Now let’s map this
# Scrape every chart week in parallel, then cache to CSV so the slow,
# network-bound scrape only has to run once.
plan(multisession)

data_raw_all_charts <- future_map_dfr(dates_charts, get_week_top100_data)

write_csv(
  data_raw_all_charts,
  here::here("posts", "2023-02-06_dataviz_top-10-sankey", "data_raw_all_charts.csv")
)
Explore and tidy up
# Reload the cached scrape instead of hitting the website again
data_raw_all_charts <- read_csv(
  here::here("posts", "2023-02-06_dataviz_top-10-sankey", "data_raw_all_charts.csv")
)
Rows: 119900 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): position_previous, title, artist, label
dbl (1): position_current
date (1): date_week_start
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
There are zero NA values in the scraped dataset!
# Give every song a stable integer id. Two id schemes are created so we
# can compare whether a re-issue on a different label should count as the
# same record (title+artist) or a different one (title+artist+label).
data_charts_with_ids <- data_raw_all_charts %>%
  group_by(title, artist) %>%
  mutate(id_title_artist = cur_group_id()) %>%
  group_by(title, artist, label) %>%
  mutate(id_title_artist_label = cur_group_id()) %>%
  ungroup()
There are 399 more unique songs when splitting the data by Title, Artist and Label.
After looking at these, everything feels okay. I can go with the title and artist ids.
# Inspect songs whose title/artist pair appears under more than one label,
# i.e. where the two id schemes disagree.
data_charts_with_ids %>%
  group_by(title, artist) %>%
  mutate(n_labels = n_distinct(label)) %>%
  ungroup() %>%
  filter(n_labels > 1) %>%
  select(title, artist, label, contains("id"), n_labels) %>%
  # distinct() is the dplyr-native (and faster) equivalent of unique()
  # on a tibble
  distinct() %>%
  arrange(title, artist)
# A tibble: 784 × 6
title artist label id_title_ar…¹ id_ti…² n_lab…³
<chr> <chr> <chr> <int> <int> <int>
1 1999 PRINCE WARNER BROS 89 89 2
2 1999 PRINCE WEA 89 90 2
3 21 GUNS GREEN DAY REPRISE 115 116 2
4 21 GUNS GREEN DAY WARNER BROS 115 117 2
5 21ST CENTURY LIFE SAM SPARRO UNIVERSAL 124 127 2
6 21ST CENTURY LIFE SAM SPARRO ISLAND 124 126 2
7 22 LILY ALLEN PARLOPHONE 125 128 2
8 22 LILY ALLEN REGAL RECORDINGS 125 129 2
9 4 IN THE MORNING GWEN STEFANI INTERSCOPE 175 179 2
10 4 IN THE MORNING GWEN STEFANI POLYDOR 175 180 2
# … with 774 more rows, and abbreviated variable names ¹id_title_artist,
# ²id_title_artist_label, ³n_labels
Deriving consecutive-week status
position_previous needs recoding
# Add each song's position in the following week, then classify rows.
# position_previous holds "New"/"Re" for (re-)entries; those are recoded
# into a status label, then mapped to NA explicitly before the numeric
# coercion — so as.numeric() no longer emits "NAs introduced by coercion".
data_charts_with_lags <- data_charts_with_ids %>%
  arrange(date_week_start) %>%
  group_by(id_title_artist) %>%
  mutate(
    position_next = lead(position_current, order_by = date_week_start)
  ) %>%
  ungroup() %>%
  mutate(
    status = case_when(
      position_previous == "New" ~ "New release",
      position_previous == "Re" ~ "Re-release",
      !is.na(position_next) ~ "Consecutive week"
    ),
    # "New"/"Re" are captured in status above; make the NA explicit
    # instead of relying on coercion failure
    position_previous = as.numeric(
      if_else(position_previous %in% c("New", "Re"),
              NA_character_,
              position_previous)
    )
  )
Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# Flag weeks where a song re-entered the chart and number each chart run
# via a cumulative count of those flags.
# NOTE(review): the original had a first mutate(check_rerelease = ifelse(...))
# that was immediately overwritten by the case_when() version below; the
# dead mutate has been removed.
data_charts_with_rereleases <- data_charts_with_lags %>%
  arrange(date_week_start, id_title_artist) %>%
  group_by(id_title_artist) %>%
  arrange(date_week_start) %>%
  mutate(check_rerelease = case_when(
    # a "Re" in a song's first observed week is its first run in this
    # dataset, not a re-release, so don't count it
    date_week_start == min(date_week_start) ~ 0,
    status == "Re-release" ~ 1,
    TRUE ~ 0
  )) %>%
  # run 1 is the first appearance; each re-release starts a new run
  mutate(nth_time_on_chart = cumsum(check_rerelease) + 1) %>%
  ungroup()
Look at re-releases
# Spot-check the run numbering against a song known to chart repeatedly
data_charts_with_rereleases %>%
  filter(title == "SHAPE OF YOU") %>%
  select(date_week_start, title, starts_with("position"), nth_time_on_chart, everything())
# A tibble: 98 × 12
date_wee…¹ title posit…² posit…³ posit…⁴ nth_t…⁵ artist label id_ti…⁶ id_ti…⁷
<date> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <int> <int>
1 2017-01-15 SHAP… 1 NA 1 1 ED SH… ASYL… 12527 12802
2 2017-01-22 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
3 2017-01-29 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
4 2017-02-05 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
5 2017-02-12 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
6 2017-02-19 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
7 2017-02-26 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
8 2017-03-05 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
9 2017-03-12 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
10 2017-03-19 SHAP… 1 1 1 1 ED SH… ASYL… 12527 12802
# … with 88 more rows, 2 more variables: status <chr>, check_rerelease <dbl>,
# and abbreviated variable names ¹date_week_start, ²position_current,
# ³position_previous, ⁴position_next, ⁵nth_time_on_chart, ⁶id_title_artist,
# ⁷id_title_artist_label
# Table of the ten songs with the most separate chart runs
data_charts_with_rereleases %>%
  arrange(desc(nth_time_on_chart)) %>%
  group_by(id_title_artist) %>%
  # keep each song's final (highest) run count
  filter(nth_time_on_chart == max(nth_time_on_chart)) %>%
  ungroup() %>%
  distinct(title, artist, nth_time_on_chart) %>%
  rename(times_on_chart = nth_time_on_chart) %>%
  slice_head(n = 10) %>%
  gt::gt()
title
artist
times_on_chart
MR BRIGHTSIDE
KILLERS
58
USE SOMEBODY
KINGS OF LEON
21
ONE DAY LIKE THIS
ELBOW
20
THRILLER
MICHAEL JACKSON
19
CHASING CARS
SNOW PATROL
18
FAIRYTALE OF NEW YORK
POGUES FT KIRSTY MACCOLL
18
I DON'T WANT TO MISS A THING
AEROSMITH
17
DON'T STOP ME NOW
QUEEN
17
MERRY XMAS EVERYBODY
SLADE
17
ALL I WANT FOR CHRISTMAS IS YOU
MARIAH CAREY
16
Finish up
# NOTE(review): an earlier pipeline here computed total_times_on_chart and
# week counters but its result was immediately overwritten by the
# assignment below (and it used n() where row_number() was intended);
# those counters are now added on demand by generate_nth_time_in_charts(),
# so the dead pipeline has been removed.
data_3rd_millenium_charts <- data_charts_with_rereleases %>%
  mutate(year = year(date_week_start)) %>%
  select(-check_rerelease)
Functions
# Add per-song run and week counters to a chart data frame:
#   total_times_on_chart — number of separate chart runs the song has
#   nth_week_on_chart    — running week counter within the rows supplied
#   total_weeks_on_chart — total weeks the song appears in the rows supplied
# Counters are relative to whatever subset of rows is passed in.
generate_nth_time_in_charts <- function(data) {
  data %>%
    group_by(id_title_artist) %>%
    arrange(id_title_artist, date_week_start) %>%
    mutate(
      total_times_on_chart = max(nth_time_on_chart),
      nth_week_on_chart = row_number(),
      total_weeks_on_chart = max(nth_week_on_chart)
    ) %>%
    ungroup()
}
Top 100 Track Lines
By year
To start I’m going to look at just 2022 and work up from there
# Restrict to 2022 weeks where the song sat in the Top 10
data_top10_2022 <- data_3rd_millenium_charts %>%
  filter(year == 2022, position_current <= 10) %>%
  select(date_week_start, starts_with("position"), title, id_title_artist, everything()) %>%
  # one id per (song, chart run) so separate runs can draw as separate lines
  mutate(id_song_nth_time = paste(id_title_artist, nth_time_on_chart)) %>%
  arrange(date_week_start, position_current)

data_top10_2022 %>%
  select(id_song_nth_time, everything())
# A tibble: 520 × 13
id_so…¹ date_wee…² posit…³ posit…⁴ posit…⁵ title id_ti…⁶ artist label id_ti…⁷
<chr> <date> <dbl> <dbl> <dbl> <chr> <int> <chr> <chr> <int>
1 9354 1 2022-01-02 1 2 40 MERR… 9354 ED SH… ATLA… 9551
2 8148 15 2022-01-02 2 3 71 LAST… 8148 WHAM RCA 8316
3 610 15 2022-01-02 3 4 56 ALL … 610 MARIA… COLU… 621
4 4359 17 2022-01-02 4 7 57 FAIR… 4359 POGUE… WARN… 4445
5 11973 … 2022-01-02 5 9 92 ROCK… 11973 BREND… MCA 12232
6 9355 15 2022-01-02 6 8 54 MERR… 9355 SHAKI… RCA 9552
7 7577 11 2022-01-02 7 11 49 IT'S… 7577 MICHA… REPR… 7735
8 2808 1 2022-01-02 8 10 83 COME… 2808 GEORG… COLU… 2857
9 7632 12 2022-01-02 9 15 82 IT'S… 7632 ANDY … SONY… 7792
10 3894 15 2022-01-02 10 12 76 DRIV… 3894 CHRIS… WARN… 3970
# … with 510 more rows, 3 more variables: status <chr>,
# nth_time_on_chart <dbl>, year <dbl>, and abbreviated variable names
# ¹id_song_nth_time, ²date_week_start, ³position_current, ⁴position_previous,
# ⁵position_next, ⁶id_title_artist, ⁷id_title_artist_label
Square chart
# "Square" chart: black-background line chart of 2022 Top 10 positions,
# one line per song, coloured darker-to-lighter by weeks on chart.
# Rows with no next-week position (or one outside the Top 10) are dropped
# so line segments don't run off the panel.
data_top10_2022 %>%
generate_nth_time_in_charts () %>%
drop_na (position_next) %>%
identity () %>%
filter (position_next <= 10 ) %>%
select (date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>%
ggplot (aes (x = date_week_start,
y = position_current,
group = as.character (id_title_artist),
colour = nth_week_on_chart
# linewidth = nth_week_on_chart
)) +
# geom_point(size = 0.3) +
# stat_smooth(se = FALSE) +
geom_line () +
# scale_colour_viridis_c() +
scale_colour_gradient (low = "black" ,
high = "white" ) +
# NOTE(review): the linewidth aesthetic is commented out above, so this
# scale currently has no effect on the plot.
scale_linewidth_continuous (breaks = c (4 , 5 , 6 )) +
guides (colour = guide_none ()) +
theme_void () +
theme (panel.background = element_rect (fill = "black" )) +
NULL
# Wider view: Top 20 positions in 2022, points at each week plus
# connecting lines, same black/white colouring by weeks on chart.
data_3rd_millenium_charts %>%
filter (
# date_week_start > ymd("2022-01-01"),
# date_week_start < ymd("2023-01-01"),
year == 2022 ,
# NOTE(review): chart positions start at 1, so the lower bound of 0 here
# is effectively just position_current <= 20.
between (position_current, 0 , 20 )
) %>%
select (date_week_start, starts_with ("position" ), title, id_title_artist, everything ()) %>%
mutate (id_song_nth_time = paste (id_title_artist, nth_time_on_chart)) %>%
arrange (date_week_start, position_current) %>%
generate_nth_time_in_charts () %>%
drop_na (position_next) %>%
identity () %>%
filter (position_next <= 20 ) %>%
select (date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>%
ggplot (aes (x = date_week_start,
y = position_current,
group = as.character (id_title_artist),
colour = nth_week_on_chart
# linewidth = nth_week_on_chart
)) +
geom_point () +
# stat_smooth(se = FALSE) +
geom_line () +
# scale_colour_viridis_c() +
scale_colour_gradient (low = "black" ,
high = "white" ) +
# NOTE(review): linewidth is not mapped, so this scale has no effect.
scale_linewidth_continuous (breaks = c (4 , 5 , 6 )) +
guides (colour = guide_none ()) +
theme_void () +
theme (panel.background = element_rect (fill = "black" )) +
NULL
Polar
# Polar variant of the Top 10 chart: the same data wrapped around a
# circle; ylim(0, NA) keeps position 1 away from the centre point.
data_top10_2022 %>%
generate_nth_time_in_charts () %>%
drop_na (position_next) %>%
identity () %>%
filter (position_next <= 10 ) %>%
select (date_week_start, position_current, title, id_title_artist, nth_week_on_chart) %>%
ggplot (aes (x = date_week_start,
y = position_current,
group = as.character (id_title_artist),
colour = nth_week_on_chart
# linewidth = nth_week_on_chart
)) +
# geom_point() +
# stat_smooth(se = FALSE) +
geom_line () +
# scale_colour_viridis_c() +
scale_colour_gradient (low = "black" ,
high = "white" ) +
# NOTE(review): linewidth is not mapped, so this scale has no effect.
scale_linewidth_continuous (breaks = c (4 , 5 , 6 )) +
coord_polar (start = 0 ) +
ylim (0 , NA ) +
# theme_void() +
# theme(panel.background = element_rect(fill = "black")) +
NULL
Instead of song look for runs at the same position
# Experiment: instead of tracking songs, find "runs" where a chart
# position is held across consecutive weeks (top 5 only, first 40 rows as
# a prototype).
data_top10_2022 %>%
  filter(position_current <= 5) %>%
  arrange(date_week_start, position_current) %>%
  select(date_week_start, starts_with("position"), title, id_title_artist) %>%
  # 1 when the song holds the same position next week, 0 otherwise
  mutate(position_kept = ifelse(position_current == position_next, 1, 0)) %>%
  slice(1:40) %>%
  # NOTE(review): a group_by(position_current, position_kept) here was
  # immediately replaced by the next group_by() (grouping is replaced,
  # not added, by default) and has been removed as dead code.
  group_by(position_current) %>%
  arrange(date_week_start) %>%
  # NOTE(review): position_kept is NA when position_next is NA, and
  # cumsum() returns NA from that row on — confirm this is intended.
  mutate(position_run = cumsum(position_kept)) %>%
  group_by(position_current, id_title_artist, position_kept) %>%
  mutate(position_run_id = cur_group_id()) %>%
  ungroup() %>%
  # View()
  ggplot(aes(x = date_week_start,
             y = position_current,
             colour = position_run_id,
             group = position_run_id)) +
  geom_point() +
  geom_line() +
  scale_colour_viridis_c()
# Second attempt at position runs: keep only weeks where the position is
# held the following week, detect gaps between retained weeks, and draw
# the resulting runs for positions 4-6.
data_top10_2022 %>%
filter (position_current <= 6 ) %>%
arrange (date_week_start, position_current) %>%
select (date_week_start, starts_with ("position" ), title, id_title_artist) %>%
filter (position_current == position_next) %>%
group_by (position_current) %>%
arrange (date_week_start) %>%
# TRUE when the next retained week is exactly 7 days later (no gap)
mutate (consecutive_run = lead (date_week_start) == date_week_start + days (7 )) %>%
# NOTE(review): cumsum() over a logical containing NA (the last retained
# row per group) yields NA from that point on — confirm run_length is
# used as intended.
mutate (run_length = cumsum (consecutive_run)) %>%
ungroup () %>%
group_by (position_current, id_title_artist, consecutive_run) %>%
mutate (position_run_id = cur_group_id ()) %>%
ungroup () %>%
arrange (date_week_start, position_current) %>%
filter (position_current >= 4 & position_current <= 6 ) %>%
ggplot (aes (x = date_week_start,
y = position_current,
# colour = position_run_id,
group = as.character (position_run_id))) +
# geom_point() +
geom_line () +
scale_colour_viridis_c ()
Citation BibTeX citation:
@online{hadley2023,
author = {Charlie Hadley},
title = {2022-09-01 {Dataviz} - {Top} 10 {Sankey}},
date = {2023-02-06},
url = {https://visibledata.co.uk/posts/2023-02-06_dataviz_top-10-sankey},
langid = {en}
}
For attribution, please cite this work as:
Charlie Hadley. 2023.
“2022-09-01 Dataviz - Top 10 Sankey.”
February 6, 2023.
https://visibledata.co.uk/posts/2023-02-06_dataviz_top-10-sankey .