I am working with a text file where data look like below
*******************************
Sensor 1028 at site 101
SID = 16384
Tag = AI.1028.BT.VOLT
04/07/16 05:00:00 12.65
04/07/16 06:00:00 12.64
04/07/16 07:00:00 12.68
04/07/16 08:00:00 13.08
04/07/16 09:00:00 13.76
*******************************
Sensor 1171 at well 102
SID = 20062
Tag = AI.1171.WT.LEV
04/07/16 05:00:00 0.95
04/07/16 06:00:00 0.90
04/07/16 07:00:00 0.82
04/07/16 08:00:00 0.71
04/07/16 09:00:00 0.59
04/07/16 10:00:00 0.48
I want to be able to extract the data for each tag and create a data frame as below-
Tag Timestamp Value
1028 04/07/16 05:00:00 12.65
1028 04/07/16 06:00:00 12.64
1028 04/07/16 07:00:00 12.68
1028 04/07/16 08:00:00 13.08
1028 04/07/16 09:00:00 13.76
1171 04/07/16 05:00:00 0.95
1171 04/07/16 06:00:00 0.90
1171 04/07/16 07:00:00 0.82
1171 04/07/16 08:00:00 0.71
1171 04/07/16 09:00:00 0.59
1171 04/07/16 10:00:00 0.48
Tag is the numeric part in the patter such as 1028 in "Tag = AI.1028.BT.VOLT" and 1171 in "Tag = AI.1171.WT.LEV".
I have looked at other questions on similar lines but I am relatively new to R and other than importing the text file using readLines
and extracting the pattern using grep
, I couldn't do anything.
Any help would be greatly appreciated. Thanks!
Using the data.table
package, I would approach it as follows:
sensortext <- readLines('sensors.txt')
library(data.table)
DT <- data.table(txt = sensortext[!grepl(pattern = '\\*+', sensortext)])
DT <- DT[, grp := cumsum(grepl('Sensor', txt))
][, `:=` (tag = as.numeric(gsub('^.*(\\d+{4}).*','\\1', grep('Tag =', txt, value = TRUE))),
sid = as.numeric(gsub('^.*(\\d+{5}).*','\\1', grep('SID = ', txt, value = TRUE))),
type = strsplit(grep('Sensor ', txt, value = TRUE),' ')[[1]][4],
type.nr = as.numeric(gsub('^.*(\\d+{3}).*','\\1', grep('Sensor ', txt, value = TRUE)))),
by = grp
][, .SD[4:.N], by = grp
][, c('datetime','value') := tstrsplit(txt, '\\s+{2}', type.convert = TRUE)
][, c('grp','txt') := NULL
][, datetime := as.POSIXct(strptime(datetime, "%d/%m/%y %H:%M:%S"))]
which gives:
> DT
tag sid type type.nr datetime value
1: 1028 16384 site 101 2016-07-04 05:00:00 12.65
2: 1028 16384 site 101 2016-07-04 06:00:00 12.64
3: 1028 16384 site 101 2016-07-04 07:00:00 12.68
4: 1028 16384 site 101 2016-07-04 08:00:00 13.08
5: 1028 16384 site 101 2016-07-04 09:00:00 13.76
6: 1171 20062 well 102 2016-07-04 05:00:00 0.95
7: 1171 20062 well 102 2016-07-04 06:00:00 0.90
8: 1171 20062 well 102 2016-07-04 07:00:00 0.82
9: 1171 20062 well 102 2016-07-04 08:00:00 0.71
10: 1171 20062 well 102 2016-07-04 09:00:00 0.59
11: 1171 20062 well 102 2016-07-04 10:00:00 0.48
Explanation:
- With the
readLines
function you read the textfile. After that, you convert it to a 1 column datatable data.table(txt = sensortext[!grepl(pattern = '\\*+', sensortext)])
.
- With
[, grp := cumsum(grepl('Sensor', txt))]
you create a grouping variable that separates the different dataparts. grepl('Sensor', txt)
creates a logical value detecting the lines that start with Sensor
(and indicate the start of a new datapart). Using cumsum
on that creates an grouping variable.
- With
tag = as.numeric(gsub('^.*(\\d+{4}).*','\\1', grep('Tag =', txt, value = TRUE)))
you extract the tag-number (as well as for sid
, type
& type.nr
).
- With
[, .SD[4:.N], by = grp]
you remove the first three lines per group (because they do not contain data and the needed info is already extracted in the previous steps).
- With
[, c('datetime','value') := tstrsplit(txt, '\\s+{2}', type.convert = TRUE)]
you convert the data which is still in text format in the txt
column into three data columns. The type.convert = TRUE
makes sure that the value
column gets the right format (numeric in this case).
- Remove the
grp
and txt
columns with [, c('grp','txt') := NULL]
(because they are not needed anymore).
- And finally convert the
datetime
column to POSIXct
format with as.POSIXct(strptime(datetime, "%d/%m/%y %H:%M:%S"))
.
To see what each step does, you can also use the following code:
DT[, grp := cumsum(grepl('Sensor', txt))]
DT[, `:=` (tag = as.numeric(gsub('^.*(\\d+{4}).*','\\1', grep('Tag =', txt, value = TRUE))),
sid = as.numeric(gsub('^.*(\\d+{5}).*','\\1', grep('SID = ', txt, value = TRUE))),
type = strsplit(grep('Sensor ', txt, value = TRUE),' ')[[1]][4],
type.nr = as.numeric(gsub('^.*(\\d+{3}).*','\\1', grep('Sensor ', txt, value = TRUE)))),
by = grp][]
DT <- DT[, .SD[4:.N], by = grp][]
DT[, c('datetime','value') := tstrsplit(txt, '\\s+{2}', type.convert = TRUE)][]
DT[, c('grp','txt') := NULL][]
DT[, datetime := as.POSIXct(strptime(datetime, "%d/%m/%y %H:%M:%S"))][]
Adding []
to each line, makes sure that the result gets printed to the console.
An alternative with base R:
sensortext <- readLines('sensors.txt')
rawlist <- split(sensortext, cumsum(grepl(pattern = '\\*+', sensortext)))
l <- lapply(rawlist, function(x) read.fwf(textConnection(x[-c(1:4)]), widths = c(17,7), header = FALSE))
reps <- sapply(l, nrow)
df <- do.call(rbind, l)
df$V1 <- strptime(df$V1, '%d/%m/%y %H:%M:%S')
names(df) <- c('datetime','value')
df$tag <- rep(as.numeric(gsub('^.*(\\d+{4}).*','\\1', grep('Tag =', sensortext, value = TRUE))), reps)
df$sid <- rep(as.numeric(gsub('^.*(\\d+{5}).*','\\1', grep('SID = ', sensortext, value = TRUE))), reps)
df$type <- rep(sapply(strsplit(grep('Sensor ', sensortext, value = TRUE),' '), '[', 4), reps)
df$type.nr <- rep(as.numeric(gsub('^.*(\\d+{3}).*','\\1', grep('Sensor ', sensortext, value = TRUE))), reps)
which gives the same result:
> df
datetime value tag sid type type.nr
1.1 2016-07-04 05:00:00 12.65 1028 16384 site 101
1.2 2016-07-04 06:00:00 12.64 1028 16384 site 101
1.3 2016-07-04 07:00:00 12.68 1028 16384 site 101
1.4 2016-07-04 08:00:00 13.08 1028 16384 site 101
1.5 2016-07-04 09:00:00 13.76 1028 16384 site 101
2.1 2016-07-04 05:00:00 0.95 1171 20062 well 102
2.2 2016-07-04 06:00:00 0.90 1171 20062 well 102
2.3 2016-07-04 07:00:00 0.82 1171 20062 well 102
2.4 2016-07-04 08:00:00 0.71 1171 20062 well 102
2.5 2016-07-04 09:00:00 0.59 1171 20062 well 102
2.6 2016-07-04 10:00:00 0.48 1171 20062 well 102
Here's one approach using dplyr, tidyr, and a text reshaping package I have on GitHub for the splitting of the file.
Your Data
x <- c("*******************************", "Sensor 1028 at site 101",
"SID = 16384", "Tag = AI.1028.BT.VOLT", "04/07/16 05:00:00 12.65",
"04/07/16 06:00:00 12.64", "04/07/16 07:00:00 12.68", "04/07/16 08:00:00 13.08",
"04/07/16 09:00:00 13.76", "*******************************",
"Sensor 1171 at well 102", "SID = 20062", "Tag = AI.1171.WT.LEV",
"04/07/16 05:00:00 0.95", "04/07/16 06:00:00 0.90", "04/07/16 07:00:00 0.82",
"04/07/16 08:00:00 0.71", "04/07/16 09:00:00 0.59", "04/07/16 10:00:00 0.48"
)
Code to read and reshape the data
if (!require("pacman")) install.packages("pacman"); library(pacman)
p_load_current_gh('trinker/textshape')
p_load(dplyr, tidyr)
## x <- readLines('myfile.txt')
split_match(x, "\\*{5,}", regex=TRUE) %>% ## find multi asterisks and split
lapply(function(x){
data_frame(Timestamp_Value = x[-c(1:3)]) %>% ## make dataframe of n-first 3 elements
separate(Timestamp_Value, c('Timestamp', 'Value'), sep = "\\s{2,}") %>% # split timestamp and value apart
mutate(Tag = gsub("(^[A-Za-z ]+)(\\d+)(\\s.+$)", "\\2", x[1])) %>% ## add Tag with regex gsub
select(3, 1:2)
}) %>%
bind_rows() %>% ## bind list of data frames together
mutate(
Tag = as.numeric(Tag),
Timestamp = as.POSIXct(strptime(Timestamp, "%d/%m/%y %H:%M:%S")),
Value = as.numeric(Value)
) ## add appropriate classes
Yields:
Source: local data frame [11 x 3]
Tag Timestamp Value
(dbl) (time) (dbl)
1 1028 2016-07-04 05:00:00 12.65
2 1028 2016-07-04 06:00:00 12.64
3 1028 2016-07-04 07:00:00 12.68
4 1028 2016-07-04 08:00:00 13.08
5 1028 2016-07-04 09:00:00 13.76
6 1171 2016-07-04 05:00:00 0.95
7 1171 2016-07-04 06:00:00 0.90
8 1171 2016-07-04 07:00:00 0.82
9 1171 2016-07-04 08:00:00 0.71
10 1171 2016-07-04 09:00:00 0.59
11 1171 2016-07-04 10:00:00 0.48
Here's a base R solution. We locate the rows that start with Sensor
or a date, and then extract the latter rows between each pair of Sensor
rows.
date_rows <- grep("^\\d+/", s)
# [1] 4 5 6 7 8 13 14 15 16 17 18
sensor_rows <- grep("^Sensor", s)
# [1] 1 10
tab <- lapply(seq_along(sensor_rows), function(x) {
tag <- sub("^Sensor (\\d+)+.*", "\\1", s[sensor_rows[x]])
last_row <- ifelse(is.na(sensor_rows[x + 1L]), length(s), sensor_rows[x + 1L] - 1L)
rows <- seq.int(sensor_rows[x], last_row)
# extract date rows between the current tag line and the next
values <- s[intersect(rows, date_rows)]
data.frame(tag = rep(tag, length(values)), value = values, stringsAsFactors = FALSE)
})
tab <- do.call("rbind", tab)
So far, we've assigned a tag to each line of data and moved them into a table.
head(tab)
# tag value
# 1 1028 04/07/16 05:00:00 12.65
# 2 1028 04/07/16 06:00:00 12.64
# 3 1028 04/07/16 07:00:00 12.68
# 4 1028 04/07/16 08:00:00 13.08
# 5 1028 04/07/16 09:00:00 13.76
# 6 1171 04/07/16 05:00:00 0.95
All that's left is to split the strings in the value column. Incidentally this would be a one-liner with data.table's tstrsplit.
split_values <- t(data.frame(strsplit(tab$value, "\\s+"), stringsAsFactors = FALSE))
colnames(split_values) <- c("date", "time", "value")
tab <- as.data.frame(cbind(tag = tab[, "tag"], split_values), stringsAsFactors = FALSE,
row.names = FALSE)
tab$timestamp <- paste(tab$date, tab$time)
tab <- tab[, c("tag", "timestamp", "value")]
str(tab)
# 'data.frame': 11 obs. of 3 variables:
# $ tag : chr "1028" "1028" "1028" "1028" ...
# $ timestamp: chr "04/07/16 05:00:00" "04/07/16 06:00:00" "04/07/16 07:00:00" "04/07/16 08:00:00" ...
# $ value : chr "12.65" "12.64" "12.68" "13.08" ...
And here's how I read the data:
s = scan(what = "character", fill = TRUE, blank.lines.skip = TRUE, sep = "\n",
text = "Sensor 1028 at site 101
SID = 16384
Tag = AI.1028.BT.VOLT
04/07/16 05:00:00 12.65
04/07/16 06:00:00 12.64
04/07/16 07:00:00 12.68
04/07/16 08:00:00 13.08
04/07/16 09:00:00 13.76
*******************************
Sensor 1171 at well 102
SID = 20062
Tag = AI.1171.WT.LEV
04/07/16 05:00:00 0.95
04/07/16 06:00:00 0.90
04/07/16 07:00:00 0.82
04/07/16 08:00:00 0.71
04/07/16 09:00:00 0.59
04/07/16 10:00:00 0.48")