Dataframe processing

2019-01-20 17:23发布

问题:

I have a dataframe, which I read by Match <- read.table("Match.txt", sep="", fill =T, stringsAsFactors = FALSE, quote = "", header = F) and looks like this:

> ab
           V1       V2  V3                       V4 V5    V6 V7    V8 V9               V10
1  Inspecting sequence  ID chr1:173244300-173244500       NA       NA                     
2   V$ATF3_Q6        |  19                      (-)  | 0.877  | 0.622  |    aagtccCATCAggg
3   V$ATF3_Q6        |  34                      (-)  | 0.788  | 0.655  |    agggaaCGACAcag
4   V$ATF3_Q6        | 102                      (+)  | 0.738  | 0.685  |    cccTGAGCttagga
5  V$CEBPB_01        |  24                      (+)  | 0.950  | 0.882  |    ccatcagGGAAGgg
72   V$YY1_01        | 117                      (+)  | 0.996  | 0.984  | acttCCCATcttttaag
73 Inspecting sequence  ID chr1:173244350-173244550       NA       NA                     
74  V$ATF3_Q6        |  52                      (+)  | 0.738  | 0.685  |    cccTGAGCttagga
75  V$ATF3_Q6        | 160                      (+)  | 0.862  | 0.687  |    gtcTGACCtggaga
76 V$CEBPB_01        |  57                      (+)  | 0.966  | 0.958  |    agcttagGAAACtt

It contains million of such repetition, where first line is: Inspecting sequence ID chr1:173244300-173244500 and then some value as can be seen above. I want to process it keeping following things in mind:

  1. Extract the first line, break it on : and - so I will get three columns like: chr1 173244300 173244500
  2. The 4th column should contain the V1$Row2 1st element, splitted on $ and _ and just take the 2nd index which will be ATF3, like this I have 30 definite (lets call them names) cases, some will be observed while others not in each case (1 case is from Row 1 to row 72, second start from row 73).
  3. If that name appears in 1 case then value B will be assigned to that column, if not value U will be assigned

So based on my input, I want to get the following output:

chr     start       stop        ATF3  CEBPB  YY1    ..(All which appear e.g from row 1 to 72, ignoring duplicates)
chr1    173244300   173244500   B     B      B  
chr1    173244350   173244550   B     B      U

I want a fix no.of column in the header (I know they are 32 such names) so if they appear in one case B will be assigned, otherwise U will be assigned.

If anybody can help me in doing this, it will be a great help.

Here is the dput of this sample dataframe:

> ab <- dput(Match[c(1:5,72:76), ])
structure(list(V1 = c("Inspecting", "V$ATF3_Q6", "V$ATF3_Q6", 
"V$ATF3_Q6", "V$CEBPB_01", "V$YY1_01", "Inspecting", "V$ATF3_Q6", 
"V$ATF3_Q6", "V$CEBPB_01"), V2 = c("sequence", "|", "|", "|", 
"|", "|", "sequence", "|", "|", "|"), V3 = c("ID", "19", "34", 
"102", "24", "117", "ID", "52", "160", "57"), V4 = c("chr1:173244300-173244500", 
"(-)", "(-)", "(+)", "(+)", "(+)", "chr1:173244350-173244550", 
"(+)", "(+)", "(+)"), V5 = c("", "|", "|", "|", "|", "|", "", 
"|", "|", "|"), V6 = c(NA, 0.877, 0.788, 0.738, 0.95, 0.996, 
NA, 0.738, 0.862, 0.966), V7 = c("", "|", "|", "|", "|", "|", 
"", "|", "|", "|"), V8 = c(NA, 0.622, 0.655, 0.685, 0.882, 0.984, 
NA, 0.685, 0.687, 0.958), V9 = c("", "|", "|", "|", "|", "|", 
"", "|", "|", "|"), V10 = c("", "aagtccCATCAggg", "agggaaCGACAcag", 
"cccTGAGCttagga", "ccatcagGGAAGgg", "acttCCCATcttttaag", "", 
"cccTGAGCttagga", "gtcTGACCtggaga", "agcttagGAAACtt")), .Names = c("V1", 
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10"), row.names = c(1L, 
2L, 3L, 4L, 5L, 72L, 73L, 74L, 75L, 76L), class = "data.frame")

回答1:

given your input file in this question as /c/tmp.txt

And this awk script saved as SO-38563400.awk:

BEGIN {
 OFS="\t" # Set the output separator
 i=0 # Just to init the counter and be sure to start at 1 later
}
 {
 #print $0
 }
/Inspecting sequence ID/ { # Changing sequence, initialize new entry with start and end
  split($4,arr,"[:-]") # split the string in fields, split on : and -
  seq[i++,"chr"]=arr[1] # Save the chr part and increase the sequence beforehand
  seq[i,"start"]=arr[2] # save the start date
  seq[i,"end"]=arr[3] # Save the end date
}

/V[$][^_]+_.*/ { # V line type,
  split($1,arr,"[$_]") # Split on $ and underscore
  seq[i,arr[2]]="B" # This has been seen, setting to B
  seq[i,"print"]=1
  names[arr[2]]++ # Save the name for output
  # (and count occurences, just for fun, well mainly because an int is cheaper to store)
  # Main reason is it allow a quicker access toa rray keys ant END block
}

END {
  head=sprintf("char%sstart%sstop",OFS,OFS,OFS)
  for (h in names) {
    head=sprintf("%s%s%s",head,OFS,h)
  }
  print(head)
  for (l=1; l<i; l++) { # loop over each line/sequence
    line=sprintf("%s%s%s%s%s",seq[l,"chr"],OFS,seq[l,"start"],OFS,seq[l,"end"])
    for (h in names) {
      if (seq[l,h]=="B") line=sprintf("%s%s%s",line,OFS,"B")
      else line=sprintf("%s%s%s",line,OFS,"U")
    }
    if (seq[l,"print"]) print line
  }
}

Passing this command:

awk -f SO-38563400.awk /c/tmp.txt > /c/Rtable.txt

Gives:

$ cat /c/Rtable.txt
char    start   stop    STAT3   ATF3    TEAD4   GATA3   JUND    HNF4A   FOXA2   MAX     CEBPB   SPI1    GABPA   CMYC    P300    E2F1    CTCF    ATF2
chr22   16049850        16050050        B       B       U       B       U       B       B       U       U       U       U       U       B       B       U       B
chr22   16049900        16050100        B       B       B       B       B       B       B       B       B       B       B       B       B       B       B       B

And then reading in r:

> x <- read.table("/c/Rtable.txt", sep="\t",  stringsAsFactors = FALSE, header=T)
> x
char    start     stop STAT3 ATF3 TEAD4 GATA3 JUND HNF4A FOXA2 MAX CEBPB SPI1 GABPA CMYC P300 E2F1 CTCF ATF2
1 chr22 16049850 16050050     B    B     U     B    U     B     B   U     U    U     U    U    B    B    U    B
2 chr22 16049900 16050100     B    B     B     B    B     B     B   B     B    B     B    B    B    B    B    B

Please disregard the setup with /c/ paths, this could work on windows or linux, there's port of awk under windows, I suggest using linux for large files due to the operating system capacities on file streaming.

We can save far more memory by not reading the whole file before printing results, but this need a fixed set of "names" but you've been too lazy to extract the names by yourself and just sent me a bunch of entries, exercice is left to you to adapt, make the list in BEGIN block, use it as entries for each seq, and on each new seq print the previous result before processing.

I hope next time you'll take some time to bake a proper question and that you'll understand you have to make some efforts for others to help you, specially after a flow of comments asking you to improve your question.



回答2:

Maybe not the best use of stringr or tidyr, but this can be done in the hadleyverse in a somewhat readable manner...

The logic flow is:

  • Determine the group by using tidyr::fill with ifelse("Inspecting", rowname, NA).
  • Mutate the fields to what you wanted
  • Use reshape (dcast) to get the format that you want.

library(dplyr)
library(tidyr)
library(reshape2)
library(stringr)

is_in <- function(v1part) {
  return(ifelse(length(v1part) > 0, "B", "U"))
}

ab1<- ab %>% 
  add_rownames() %>%
  mutate(rowname = ifelse(V1=="Inspecting", rowname, NA),
         V4a = ifelse(V4 == "(-)" | V4 == "(+)", NA, V4),

         chr = str_extract_all(ab$V4, "^chr[^:]+", simplify = T)[,1],
         chr = ifelse(chr=="", NA, chr),

         start = str_split_fixed(V4a, ":|-", 3)[,2],
         start = ifelse(start=="", NA, start), 

         stop = str_split_fixed(V4a, ":|-", 3)[,3],
         stop = ifelse(stop=="", NA, stop),

         V1part = str_split_fixed(V1, "\\$|_", 3)[,2]) %>%
  fill(rowname, .direction="down") %>% 
  group_by(rowname) %>%
  fill(chr, .direction="down") %>%
  fill(start, .direction="down") %>%
  fill(stop, .direction="down") %>%
  dcast(chr+start+stop ~ V1part, fun.aggregate=is_in)

> ab1
   chr     start      stop Var.4 ATF3 CEBPB YY1
1 chr1 173244300 173244500     B    B     B   B
2 chr1 173244350 173244550     B    B     B   U


回答3:

Not elegant, but it should work (Your data has a column with "|"... I named it df):

cond <- which(!df$V2 == "|")
new_df <- data.frame(chr=character(length(cond)), start=character(length(cond)), stop=character(length(cond)))

for (i in 1:length(cond)) {
  line <- df[cond[i], ]
  var <- unlist(strsplit(line$V4, split = ":"))
  var2 <- unlist(strsplit(var[2], split = "-"))
  new_df$chr[i] <- var[1]
  new_df$start[i] <- var2[1]
  new_df$stop[i] <- var2[2]
  for (k in (i+1):(cond[i+1]-1)) {
    # Your code using name <- df$V1 (Use strsplit again)
    # df[i, name] <- ...
  }
}