Looking for a better way to visualise a distribution in R

2019-08-12 17:57发布

I'd like to visualise the following data: a hotel observes that each year some of its customers are repeat customers. So, each year about half of all customers are first-time customers, 20% are second-time customers, and so on. Below is some R code that includes the data and a visualisation. However, I'm not happy with it and I'm looking for improvements:

  • R doesn't like color bands with many colours - so maybe group data?
  • would a step curve be a better visualisation altogether?
  • The number of visits is treated as a factor - is this the right approach?

  • Stacking bars makes it easy to compare 1st-time guests but not the other ones. Should I pick a different visualisation?

    #! /usr/bin/env R CMD BATCH

    library(ggplot2)

    # Guest counts per year, keyed by how many times the guest has visited.
    d <- read.table(header=TRUE, text='
        year visit count
        2013 1 1641
        2013 2 604
        2013 3 256
        2013 4 89
        2013 5 32
        2013 6 10
        2013 7 4
        2013 8 3
        2014 1 1365
        2014 2 637
        2014 3 276
        2014 4 154
        2014 5 86
        2014 6 39
        2014 7 19
        2014 8 6
        2014 9 4
        2014 10 2
        2014 11 1
        2014 12 1
        2015 1 1251
        2015 2 608
        2015 3 288
        2015 4 143
        2015 5 88
        2015 6 52
        2015 7 21
        2015 8 8
        2015 9 8
        2015 10 3
        2015 11 2
        2015 12 1')

    # Both keys are treated as categories: one bar per year and one fill
    # colour per visit number.
    key_cols <- c("year", "visit")
    d[key_cols] <- lapply(d[key_cols], factor)

    # position = "fill" rescales every bar to height 1, so each segment is
    # the share of that visit number among the year's guests.
    p <- ggplot(d, aes(year, count)) +
      geom_bar(aes(fill = visit), stat = "identity", position = "fill") +
      xlab("Year") +
      ylab("Distribution")

    # pdf("returners.pdf",9,6)
    print(p)
    # dev.off()
    

enter image description here

标签: r ggplot2
2条回答
ゆ 、 Hurt°
2楼-- · 2019-08-12 18:28

It seems that you're trying to compare the contributions to the total number of visits to the hotel by number of prior visits as well as do a year-to-year comparison. The following code puts this together in one chart.

# Dodged bars of guest counts per visit number, with a dotted running total
# per year drawn on top, so years can be compared on both measures at once.
# Expects a data frame `d` with columns `year`, `visit` and `count`.
d$year <- factor(d$year)
# `visit` must be numeric here (max() and scale_x_continuous() below), so
# undo any earlier factor() conversion; plain integers pass through unchanged.
d$visit <- as.integer(as.character(d$visit))
# Sort by year, then visit, so the per-year cumsum accumulates in visit order.
d <- transform(
  d[order(d$year, d$visit), ],
  cum_count = ave(count, year, FUN = cumsum)
)

x_max <- max(d$visit)
y_max <- max(d$cum_count)
color_sch <- c("red", "tan", "blue")  # one colour per year

p <- ggplot(data = d, aes(x = visit, colour = year))
p <- p + geom_bar(aes(y = count, fill = year), position = "dodge",
                  stat = "identity", width = .7)
p <- p + geom_line(aes(y = cum_count), linetype = "dotted", size = 1)
p <- p + geom_point(aes(y = cum_count), size = 4)
p <- p + scale_y_continuous(breaks = seq(0, y_max, 250))
p <- p + scale_x_continuous(breaks = seq_len(x_max))
p <- p + scale_colour_manual(values = color_sch)
p <- p + scale_fill_manual(values = color_sch)
p <- p + xlab("Visit") + ylab("Count and \nCumulative Count")
# annotate() draws each caption exactly once. geom_text() with constant
# aesthetics would redraw the same label for every row of `d`, stacking
# dozens of identical labels on top of each other.
p <- p + annotate("text", x = 2, y = d$count[2],
                  label = "Count by Number of Visits",
                  hjust = -.5, vjust = -2.0, size = 6, colour = "black")
p <- p + annotate("text", x = x_max - 5, y = tail(d$cum_count, 1),
                  label = "Cumulative Count",
                  hjust = 0, vjust = 2.0, size = 6, colour = "black")
# pdf("returners.pdf",9,6)
print(p)
# dev.off()

which gives the chart

enter image description here

This representation suggests that the drop in 2015 as compared to previous years is due to fewer first-time customers as opposed to a reduction in returning ones.

查看更多
兄弟一词,经得起流年.
3楼-- · 2019-08-12 18:45

Why not visualize them like actual distributions?

# One bar-chart panel per year showing the number of guests per visit number.
# Expects a data frame `d` with columns `year`, `visit` and `count`.
# factor(visit) keeps the x axis discrete whether or not `visit` has already
# been converted to a factor, which scale_x_discrete() requires.
p <- ggplot(d, aes(factor(visit), count)) +
  geom_bar(stat = "identity", width = 0.75) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  facet_wrap(~year) +
  labs(x = NULL, y = "Visits") +
  ggthemes::theme_tufte(base_family = "Helvetica") +
  theme(
    legend.position = "none",
    # Thin horizontal major grid lines only.
    panel.grid = element_line(color = "#2b2b2b", size = 0.15),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.ticks = element_blank(),
    strip.text = element_text(hjust = 0),
    # panel.spacing.x replaces panel.margin.x, deprecated in ggplot2 2.2.0.
    panel.spacing.x = unit(1, "cm")
  )
p

enter image description here

To see the visit count deltas by year, you can just swap the facets:

# One lollipop panel per visit number, showing how that count changes from
# year to year. Expects a data frame `d` with `year`, `visit` and `count`.
d$year <- factor(d$year)
# "%s" formats plain numbers and factor levels alike by their printed value;
# "%d" applied to a factor would format its internal integer codes instead.
d$visit <- sprintf("Visit: %s", d$visit)
# Keep the panels in order of first appearance rather than alphabetical
# order, which would put "Visit: 10" before "Visit: 2".
d$visit <- factor(d$visit, levels = unique(d$visit))

p <- ggplot(d, aes(year, count)) +
  geom_segment(aes(xend = year, yend = 0), size = 0.3) +
  geom_point() +
  scale_x_discrete(expand = c(0, 0.25)) +
  # Argument spelled out as `labels`; `label` only worked by partial matching.
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~visit, scales = "free_y") +
  labs(x = "NOTE: Free y-axis scale", y = "Count") +
  ggthemes::theme_tufte(base_family = "Helvetica") +
  theme(
    legend.position = "none",
    panel.grid = element_line(color = "#2b2b2b", size = 0.15),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.ticks = element_blank(),
    strip.text = element_text(hjust = 0),
    # panel.spacing replaces panel.margin, deprecated in ggplot2 2.2.0.
    panel.spacing = unit(1.5, "cm")
  )
p

enter image description here

Or, you can look at YoY growth by visit (%):

library(dplyr)

# Year-over-year relative change in guest count, computed separately for
# each visit number. Expects a data frame `d` with `year`, `visit`, `count`.
# Adds columns `lag` (previous year's count), `chg_pct` and `pos`.
d <- d %>%
  group_by(visit) %>%
  arrange(year) %>%
  mutate(
    lag = dplyr::lag(count),
    chg_pct = (count - lag) / lag,
    # The first year of each visit number has no predecessor: show it as 0.
    chg_pct = ifelse(is.na(chg_pct), 0, chg_pct),
    # "-1", "0" or "1": the direction of change, used to pick the colour.
    pos = as.character(sign(chg_pct))
  ) %>%
  ungroup()  # do not leave the grouping attached to `d`

p <- ggplot(d, aes(year, chg_pct)) +
  geom_hline(yintercept = 0, color = "#2b2b2b", size = 0.5) +
  geom_segment(aes(xend = year, yend = 0, color = pos), size = 0.3) +
  geom_point(aes(color = pos)) +
  scale_x_discrete(expand = c(0, 0.25)) +
  # Argument spelled out as `labels`; `label` only worked by partial matching.
  scale_y_continuous(labels = scales::percent) +
  # Named values tie each colour to its sign, so the mapping does not depend
  # on which signs occur in the data or on how the level strings sort.
  scale_color_manual(
    values = c("-1" = "#b2182b", "0" = "#878787", "1" = "#7fbc41")
  ) +
  facet_wrap(~visit, scales = "free_y") +
  labs(x = "NOTE: free y-axis", y = "YoY % Difference per visit count") +
  ggthemes::theme_tufte(base_family = "Helvetica") +
  theme(
    legend.position = "none",
    panel.grid = element_line(color = "#2b2b2b", size = 0.15),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.ticks = element_blank(),
    strip.text = element_text(hjust = 0),
    # panel.spacing replaces panel.margin, deprecated in ggplot2 2.2.0.
    panel.spacing = unit(1.5, "cm")
  )
p

enter image description here

查看更多
登录 后发表回答