I have a fairly large dataset saved in a .txt
file, that i read into my environment using fread()
from data.table
. I would like to organize the data so that it looks like this:
# Desired output: Column headers = [ Date, Company Names, RET, RETX,PRC, vwretd, ewrtd, sprtrn]
# rows contain stock return data... (I made up the values below to save time, but the format must remain).
Date AMERICAN CAR & FDRY CO ALASKA JUNEAU GOLD MNG C RET RETX vwretd ...
1925-12-31 0.0432 0.0231 0.0032 0.053 ...
1926-01-02 0.0210 0.0133 0.0124 0.003 ...
1926-01-04 0.0324 0.0131 0.0134 0.023 ...
... ... ... ... ... ...
The only problem is that I am having trouble because my crsp$date
column has duplicate Dates
(SEE REPRODUCIBLE DATA), as the data seems to be row binded
Here is what I run when I import data using fread()
crsp <- fread("/Volumes/share/CRSP/CRSP_RETS/1925-1930.txt", sep="\t", header=TRUE)
crsp <- as.data.frame(crsp)
After import I run the following to format date
column
crsp$DATE <- as.Date(as.character(crsp$date),format="%Y%m%d")
I then try to convert to XTS
data <- xts(crsp[,3:11], crsp$DATE)
data
returns the following: [ dput(data)
]
data <- structure(c(NA, NA, NA, NA, NA, NA, "AMERICAN CAR & FDRY CO",
"AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO", "ALASKA JUNEAU GOLD MNG CO",
"AMERICAN CAR & FDRY CO", "ALASKA JUNEAU GOLD MNG CO", "A", "A",
"A", "A", "A", "A", "109.0000", "109.1250", "111.0000", " 1.8750",
"110.5000", " 1.7500", "C", "0.001147", "0.017182", "C", "-0.004505",
"-0.066667", "C", "0.001147", "0.017182", "C", "-0.004505", "-0.066667",
NA, " 0.005893", " 0.001277", " 0.001277", "-0.003984", "-0.003984",
NA, " 0.009516", " 0.005780", " 0.005780", "-0.001927", "-0.001927",
NA, NA, NA, NA, NA, NA), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC",
class = c("xts",
"zoo"), index = structure(c(-1388620800, -1388448000, -1388275200,
-1388275200, -1388188800, -1388188800), tzone = "UTC", tclass = "Date"), .Dim = c(6L,
9L), .Dimnames = list(NULL, c("TICKER", "COMNAM", "TRDSTAT",
"PRC", "RET", "RETX", "vwretd", "ewretd", "sprtrn")))
REPRODUCIBLE DATA
crsp <- structure(list(PERMNO = c(10006L, 10006L, 10006L, 10006L, 10006L,
10006L, 10006L, 10006L, 10006L, 10014L, 10014L, 10014L, 10014L,
10014L, 10014L, 10014L, 10014L), date = c(19251231L, 19260102L,
19260104L, 19260105L, 19260106L, 19260107L, 19301229L, 19301230L,
19301231L, 19260104L, 19260105L, 19260106L, 19260107L, 19260108L,
19260109L, 19260111L, 19260112L), TICKER = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), COMNAM = c("AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO",
"AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO",
"AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO", "AMERICAN CAR & FDRY CO",
"AMERICAN CAR & FDRY CO", "ALASKA JUNEAU GOLD MNG CO", "ALASKA JUNEAU GOLD MNG CO",
"ALASKA JUNEAU GOLD MNG CO", "ALASKA JUNEAU GOLD MNG CO", "ALASKA JUNEAU GOLD MNG CO",
"ALASKA JUNEAU GOLD MNG CO", "ALASKA JUNEAU GOLD MNG CO", "ALASKA JUNEAU GOLD MNG CO"
), TRDSTAT = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A"), PRC = c(109, 109.125, 111,
110.5, 110.5, 110.5, 24.25, 24.5, 27.5, 1.875, 1.75, 1.75, -1.8125,
1.75, -1.6875, 1.625, NA), RET = c("C", "0.001147", "0.017182",
"-0.004505", "0.000000", "0.000000", "-0.034826", "0.010309",
"0.122449", "C", "-0.066667", "0.000000", "0.035714", "-0.034483",
"-0.035714", "-0.037037", ""), RETX = c("C", "0.001147", "0.017182",
"-0.004505", "0.000000", "0.000000", "-0.034826", "0.010309",
"0.122449", "C", "-0.066667", "0.000000", "0.035714", "-0.034483",
"-0.035714", "-0.037037", ""), vwretd = c(NA, 0.005893, 0.001277,
-0.003984, -0.000172, 0.007211, -0.002367, 0.020064, 0.016505,
0.001277, -0.003984, -0.000172, 0.007211, -0.000804, 0.003384,
-0.00868, -0.001027), ewretd = c(NA, 0.009516, 0.00578, -0.001927,
0.001182, 0.008453, -0.017949, 0.016016, 0.052093, 0.00578, -0.001927,
0.001182, 0.008453, -0.001689, 0.003312, -0.009943, -0.003623
), sprtrn = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("PERMNO",
"date", "TICKER", "COMNAM", "TRDSTAT", "PRC", "RET", "RETX",
"vwretd", "ewretd", "sprtrn"), row.names = c(1L, 2L, 3L, 4L,
5L, 6L, 1483L, 1484L, 1485L, 1486L, 1487L, 1488L, 1489L, 1490L,
1491L, 1492L, 1493L), class = "data.frame")
Format date
column ( same code as above)
crsp$DATE <- as.Date(as.character(crsp$date),format="%Y%m%d")
data <- xts(crsp[,3:11], crsp$DATE)
UPDATE
Code produced by @akrun works efficiently with sample data but I am having trouble with actual dataset:
# this code works well on actual data
crsp1 <- crsp[,c(4, 6:12)];library(reshape2)
# this is the part where I am having difficulty with
crsp2 <- dcast(crsp1[,c(1:2, 8)], DATE~COMNAM, value.var="PRC")
It returns the following error: Aggregation function missing: defaulting to length
This is a sample of what crsp2
returns:
crsp2 <- structure(list(DATE = structure(c(-16072, -16070, -16068, -16067,
-16066), class = "Date"), `A P W PAPER CO` = c(0, 0, 0, 0, 0),
`ABITIBI POWER & PAPER LTD` = c(1, 1, 1, 1, 1), `ABRAHAM & STRAUSS INC` = c(0,
0, 0, 0, 0), `ADAMS EXPRESS CO` = c(1, 1, 1, 1, 1)), .Names = c("DATE",
"A P W PAPER CO", "ABITIBI POWER & PAPER LTD", "ABRAHAM & STRAUSS INC",
"ADAMS EXPRESS CO"), row.names = c(NA, 5L), class = "data.frame")
As far as format
, it returns the desired output, but the problem is that it returns 1's and 0's instead of actual values...
I believe it returns 1's where data is found/starts, and 0's where no data is found. I later tried specifying the Aggregation function
:
# Specification of aggregation function plus "fill" to try to fix length errors & "drop" to
# avoid values being dropped
crsp2 <- dcast(crsp1[,c(1:2, 8)], DATE ~ COMNAM, value.var="RET", fun.aggregate= length, fill=
0);library(plyr);
This time, I do not get any errors but still get the 1's and 0's. Perhaps it might be better to seperate the data by COMNAM
and then index by DATE
column, so that I can just merge()
or cbind()
to get the desired output. But i do not know if there is a better way? I have also tried the following functions but left them running for more than 6 hours and came back and where still running... cannot seem to make them work...
# daply() from library(plyr)
crsp2 <- daply(crsp1[,c(1:2, 8)], .(DATE,COMNAM), function(x) x$PRC, .parallel=TRUE)
# reshape()
crsp2 <- reshape(crsp1[,c(1:2, 8)], idvar="COMNAM", timevar="DATE", direction="wide")