I have the following data (only a small part of it is shown) in a file named "short2_pre_snp_tumor.txt":
rs987435 C G 1 1 1 0 2
rs345783 C G 0 0 1 0 0
rs955894 G T 1 1 2 2 1
rs6088791 A G 1 2 0 0 1
rs11180435 C T 1 0 1 1 1
rs17571465 A T 1 2 2 2 2
rs17011450 C T 2 2 2 2 2
rs6919430 A C 2 1 2 2 2
rs2342723 C T 0 2 0 0 0
rs11992567 C T 2 2 2 2 2
I need to generate the PED and MAP files using Python, because R is really slow on large datasets.
I have the following code in R:
# Convert a SNP genotype table into a PLINK-style PED file.
#
# Input  ("short2_pre_snp_tumor.txt"): one row per SNP with the columns
#   SNP id, allele 1, allele 2, then one genotype code per sample, where
#   0 = allele1/allele1, 1 = allele1/allele2, 2 = allele2/allele2.
# Output ("BRCA_tumor.ped"): one row per sample with six leading columns
#   (FID, IID, PID, MID, SEX, PHENOTYPE) followed by two allele columns per SNP.
# Any genotype code other than 0, 1 or 2 is left as NA in the output.
tm <- proc.time()

# Read every column as text: with automatic type guessing an allele column
# that contains only "T" would be converted to logical TRUE.
d <- read.table("short2_pre_snp_tumor.txt", colClasses = "character")
stopifnot(ncol(d) >= 3L)
n <- nrow(d) # number of SNPs (237196 in the full data set)
nrs <- ncol(d) - 3L # number of samples (1116 in the full data set)

allele1 <- d[[2]]
allele2 <- d[[3]]

# Genotype codes as a numeric matrix, transposed so that samples are rows
# and SNPs are columns (the orientation of the PED file).
geno <- as.matrix(d[, -(1:3), drop = FALSE])
dimnames(geno) <- NULL
storage.mode(geno) <- "numeric"
geno <- t(geno)

# Column (SNP) number of a linear index into the nrs x n genotype matrix.
snp_index <- function(k) (k - 1) %/% nrs + 1

# which() drops NA comparisons, so missing genotypes simply stay NA below.
hom1 <- which(geno == 0)
het <- which(geno == 1)
hom2 <- which(geno == 2)

# First and second allele of every sample/SNP pair, filled in one vectorised
# assignment per genotype class instead of a cell-by-cell double loop.
first <- matrix(NA_character_, nrow = nrs, ncol = n)
second <- first
first[hom1] <- allele1[snp_index(hom1)]
first[het] <- allele1[snp_index(het)]
first[hom2] <- allele2[snp_index(hom2)]
second[hom1] <- allele1[snp_index(hom1)]
second[het] <- allele2[snp_index(het)]
second[hom2] <- allele2[snp_index(hom2)]

# Interleave the two alleles: SNP i occupies columns 2i - 1 and 2i.
# dd is a character matrix (not a data frame); dd[j, k] indexing is unchanged.
dd <- matrix(NA_character_, nrow = nrs, ncol = 2L * n)
dd[, 2L * seq_len(n) - 1L] <- first
dd[, 2L * seq_len(n)] <- second

# Six leading PED columns. SEX is assigned at random (1 or 2) and PHENOTYPE
# is fixed at 2 for every sample.
ped6front <- data.frame(
  FID = seq_len(nrs),
  IID = seq_len(nrs),
  PID = rep(0, nrs),
  MID = rep(0, nrs),
  SEX = sample(1:2, nrs, replace = TRUE),
  PHENOTYPE = rep(2, nrs)
)
BRCA_tumorfromR.ped <- cbind(ped6front, dd)

# row.names = FALSE: otherwise write.table prepends the row names as an extra
# first column, so the file would have seven leading columns instead of six.
write.table(
  BRCA_tumorfromR.ped, "BRCA_tumor.ped",
  append = FALSE, quote = FALSE, row.names = FALSE, col.names = FALSE
)

# Elapsed time since the start of the script.
print(proc.time() - tm)