Plotting career path in graph

2019-08-16 01:42发布

问题:

I have the dataframe

test <- structure(list(
     y2002 = c("freshman","freshman","freshman","sophomore","sophomore","senior"),
     y2003 = c("freshman","junior","junior","sophomore","sophomore","senior"),
     y2004 = c("junior","sophomore","sophomore","senior","senior",NA),
     y2005 = c("senior","senior","senior",NA, NA, NA)), 
              .Names = c("2002","2003","2004","2005"),
              row.names = c(c(1:6)),
              class = "data.frame")
> test
       2002      2003      2004   2005
1  freshman  freshman    junior senior
2  freshman    junior sophomore senior
3  freshman    junior sophomore senior
4 sophomore sophomore    senior   <NA>
5 sophomore sophomore    senior   <NA>
6    senior    senior      <NA>   <NA>

and I want to create a graph that should resemble the ugly text art below:

freshman ---- junior ----------------------\

freshman ---- junior --- sophomore -------- senior

sophomore ================================/

senior ---------------------------------/

In other words, I need to show in a graph the possible paths to "senior", giving weights to edges according to the number of cases using that path.

First attempt This code generates a graph, but not one similar to the text art above.

library(igraph)
elist <- lapply(seq_len(nrow(test)), function(i) {
  x <- as.character(test[i,])
  x <- unique(na.omit(x))
  x <- rep(x, each=2)
  x <- x[-1]
  x <- x[-length(x)]
  r <- matrix(x, ncol=2, byrow=TRUE)
  if (nrow(r) > 0) { r <- cbind(r, i) } else { r <- cbind(r, numeric()) }
  r
})

result <- as.data.frame(do.call(rbind, elist))
names(result) <- c("vertex","edge", "id")
categories <- data.frame(name=c("freshman","junior","sophomore","senior"))
g <- graph.data.frame(result,directed=T,vertices=categories)
g <- set.edge.attribute(g, "weight", value=runif(ecount(g))*10)
igraph.par("plot.layout", layout.reingold.tilford)
plot(g, vertex.label=categories$name, vertex.label.dist=7, 
     edge.width=get.edge.attribute(g,"weight"), edge.arrow.size=1.5)

Result (not what I wanted)

**This question is related to this post*

**And this post is a necessary step for solving this question*

回答1:

I don't think you understand exactly what your up to, in your example sketch of the graph you have 8 nodes. But in the figure you produced you only have 4 nodes. This is because in the graph you create you only have 4 nodes. igraph will treat nodes with the same name (e.g. two sophomores as the same node but with two edges)

However, once you have made the graph with multiple sophomore etc nodes. e.g. sophomore1 and sophomore2. You can then place the nodes where you want them with layout as below

 df<- read.table(text="vertex edge weight
 freshman junior 2
 junior    senior 2
 freshman2 junior2 2
 junior2 sophomore 2
 sophomore senior 2
 sophomore2 senior 3
 senior2   senior 2",header=TRUE)
 categories <-data.frame(name=c("freshman","junior","sophomore","senior","freshman2",
 "junior2","sophomore2","senior2"))

g <- graph.data.frame(df,directed=T,vertices=categories)
layOUT<-data.frame(x=c(1,2,3,4,1,2,1,1),y=c(4,4,3,3,3,3,2,1))
l<-as.matrix(layOUT)
plot(g,layout=l)



回答2:

This is a completely worked out solution. One has to read the graph bottom up, bearing in mind that each individual is represented as a vertical line that represents their career path. Yes, I did ditch iGraph for this task. \o/

require(reshape2)

meltpath <- function(x){
  require(data.table)
  x <- melt(data = x, id.vars = 'id', measure.vars = names(x)[-1])
  names(x) <- c('id','year','category')
  x$year <- factor(x$year)
  id <- unique(x$id)
  idtable <- data.table(id = id, count = 1:length(id))
  x <- x[order(x$id), ]
  x <- merge(x, idtable, by='id')
  return(x)
}

carpath <- function(datatable, max_x = max(datatable$count)){
  require(ggplot2)
  p = ggplot(datatable, aes(x = count, y = year, fill = category)) + 
    geom_tile() +
    scale_y_discrete(name = "year\n", 
                     breaks = rev(levels(datatable$year))) + 
    scale_x_continuous(name = "cumulative count", 
                      limits = c(0,max_x)) +
    guides(fill = guide_legend(title="Career stage\n",
                               reverse=TRUE)) +
    theme(panel.grid.major = element_blank(), 
         panel.background = element_blank(), 
         axis.ticks = element_blank(),
         plot.title = element_text(vjust = 1.2, face="bold", size=20),
         axis.title.y = element_text(size=15, face="bold"),
         axis.text.y = element_text(size=15, colour="black"),
         legend.title = element_text(size = 15),
         legend.text = element_text(size = 15)) +
         scale_fill_brewer(palette = "Dark2") +
    ggtitle("Career path of individual Students by year")
  p
}

test <- structure(list(
  id = 1:6,
  y2002 = c("freshman","freshman","freshman","sophomore","sophomore","senior"),
  y2003 = c("freshman","junior","sophomore","sophomore","sophomore","senior"),
  y2004 = c("junior","sophomore","sophomore","senior","senior",NA),
  y2005 = c("senior","senior","senior",NA, NA, NA)), 
                  .Names = c("id","2002","2003","2004","2005"),
                  row.names = c(c(1:6)),
                  class = "data.frame")
# Grow dataset
testg = data.frame()
for (i in rownames(test)) {
  test0 <- test[rep(i, each=abs(floor(rnorm(1)*100))),]
  testg <- rbind(testg, test0)
}
testg$id <- 1:nrow(testg)
# Munge
test0 <- testg
test1 <- melt(data = test0, id.vars = 'id', measure.vars = names(test0)[-1])
names(test1) <- c('id','year','category')
test1$category[test1$category == 'freshman'] <- 1
test1$category[test1$category == 'junior'] <- 2
test1$category[test1$category == 'sophomore'] <- 3
test1$category[test1$category == 'senior'] <- 4
test1$category <- factor(test1$category, levels=1:4, labels = c('1. freshman','2. junior','3. sophomore','4. senior'))
test1 <- test1[order(test1$category), ]
test1 <- dcast(test1, id ~ year)
test1 <- test1[order(test1$'2005',test1$'2004',test1$'2003',test1$'2002'), ]
test2 <- meltpath(test1)
carpath(test2)