您的位置:首页 > 其它

R 笔记:大型数据文件流读取与写入

2014-08-09 23:23 267 查看
# ------------clear existing variables------------
# NOTE: rm(list=ls()) removes every (non-hidden) object in the current
# environment -- the global workspace when this script is sourced at top
# level -- so anything unsaved from the session is lost.
rm(list=ls())

# Timestamp taken at script start; passed to StopWatch() to report elapsed time.
START_TIME <- Sys.time()
# Print (and invisibly return) the wall-clock time elapsed since `start_time`.
#
# Args:
#   start_time: A date-time accepted by difftime(), typically a value
#     captured earlier with Sys.time().
#
# Returns:
#   The printed string "Time Cost: HH:MM:SS". Hours are not wrapped at 24,
#   so a 25-hour run reports "25:00:00" instead of "01:00:00".
StopWatch <- function(start_time) {
  elapsed_secs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  # Truncate to whole seconds; clamp at zero so a start_time in the future
  # does not produce a nonsensical negative clock value.
  total <- max(0, floor(elapsed_secs))
  hh <- as.integer(total %/% 3600)
  mm <- as.integer((total %% 3600) %/% 60)
  ss <- as.integer(total %% 60)
  print(paste("Time Cost", sprintf("%02d:%02d:%02d", hh, mm, ss), sep = ": "))
}

# Working directory for the analysis; created on first run.
path <- "C:/Users/Public/Data Analysis in R"
if (!file.exists(path)) {
  print("Creating work directory...")
  dir.create(path)
}
setwd(path)

# Input file, resolved relative to the working directory set above.
data_folder <- "data"
filename <- "data.csv"
filename <- file.path(data_folder, filename)

if (!file.exists(filename)) {
  # Raise an error instead of calling quit(): quit() terminates an
  # interactive session outright and, under Rscript, exits with a success
  # status even though the input is missing.
  stop("Data file does not exist: ", filename, call. = FALSE)
}

# Test whether two timestamps are more than `border` hours apart.
#
# Args:
#   mtime:  Timestamp of the current row (anything difftime() accepts; the
#           script passes the first ';'-separated field of a line).
#   btime:  Reference timestamp to compare against.
#   border: Threshold in hours.
#
# Returns:
#   TRUE when |mtime - btime| is strictly greater than `border` hours.
#
# NOTE(review): the script counts rows for which this is TRUE as the head/tail
# rows to keep in full; confirm that `>` (rather than `<=`) is the intended
# direction.
resample <- function(mtime, btime, border) {
  gap_hours <- as.numeric(difftime(mtime, btime), units = "hours")
  abs(gap_hours) > border
}

# ---- Pass 1: print the header, find the first/last timestamps, count lines ----

# Threshold (in hours) handed to resample().
border <- 24
resample_length_first <- 0
resample_length_second <- 0
# Starts at 1 and is incremented after every read, including the final read
# that hits end-of-file, so it ends one greater than the number of lines in
# the file. The write pass compares line indices against this value.
data_length <- 1
# First field of the first data line / most recent ';'-delimited line.
stime <- NULL
mtime <- NULL

fcon <- file(filename, open = "r")
line <- readLines(fcon, n = 1)
if (length(line) == 0) {
  # An empty file would otherwise fail below with "subscript out of bounds"
  # and leave the connection open.
  close(fcon)
  stop("Data file is empty: ", filename, call. = FALSE)
}
print("Titles>>>")
print(strsplit(line, split = ";")[[1]])
while (length(line) != 0) {
  if (grepl(";", line)) {
    mtime <- line
    # Take the start time from the first delimited line after the header,
    # even when the physical second line happens to be malformed.
    if (data_length > 1 && is.null(stime)) {
      stime <- strsplit(mtime, split = ";")[[1]][1]
      print(paste("startTime", stime, sep = ": "))
    }
  } else {
    print("Unexpected line:")
    print(data_length)
    print(line)
  }
  line <- readLines(fcon, n = 1)
  data_length <- data_length + 1
}
close(fcon)

if (is.null(mtime)) {
  stop("No ';'-delimited lines found in ", filename, call. = FALSE)
}
# End time is the first field of the last delimited line seen.
etime <- strsplit(mtime, split = ";")[[1]][1]
print(paste("endTime", etime, sep = ": "))
print(paste("Count", data_length, sep = ": "))

StopWatch(START_TIME)

# ---- Pass 2: count the rows flagged by resample(), then derive the step ----
# NOTE(review): resample() is TRUE for rows *more* than `border` hours from the
# reference time, yet the counts are later used as the number of leading /
# trailing lines to copy in full -- confirm the comparison direction.
scon <- file(filename, open = "r")
line <- readLines(scon, n = 1)  # header line; not evaluated
while (length(line) != 0) {
  line <- readLines(scon, n = 1)
  if (length(line) > 0 && grepl(";", line)) {
    mtime <- strsplit(line, ";")[[1]][1]
    if (resample(mtime, stime, border)) {
      resample_length_first <- resample_length_first + 1
    }
    if (resample(mtime, etime, border)) {
      resample_length_second <- resample_length_second + 1
    }
  }
}
close(scon)

# Row cap for the output file (presumably chosen to stay just under Excel's
# 1,048,576-row sheet limit -- confirm).
max_rows <- 1048570
resample_length <- resample_length_first + resample_length_second
middle_rows <- data_length - resample_length
row_budget <- max_rows - resample_length
if (row_budget > 0) {
  # Keep every `interval`-th middle row; never let the step drop below 1,
  # which would make `%% interval` in the write pass NaN or erratic.
  interval <- max(1, ceiling(middle_rows / row_budget))
} else {
  # Head and tail alone already meet or exceed the cap: keep at most one
  # middle row instead of dividing by a zero or negative budget.
  warning("Rows kept in full (", resample_length, ") reach the row cap (",
          max_rows, "); the middle section is reduced to a single row.",
          call. = FALSE)
  interval <- max(1, middle_rows)
}
print(paste("Interval", interval, sep = ": "))

StopWatch(START_TIME)

# ---- Pass 3: stream the input again and write the thinned copy ----
# Lines whose index falls in the leading or trailing range are copied
# verbatim; every other line is copied only when its running position within
# the middle section is a multiple of `interval`.
line_no <- 0
middle_seen <- 0
src_con <- file(filename, open = "r")
out_con <- file("resample.csv", open = "w")
repeat {
  record <- readLines(src_con, n = 1)
  if (length(record) == 0) {
    break
  }
  line_no <- line_no + 1
  in_head <- line_no <= resample_length_first
  in_tail <- line_no >= data_length - resample_length_second
  if (in_head || in_tail) {
    writeLines(record, out_con)
    next
  }
  if (middle_seen %% interval == 0) {
    writeLines(record, out_con)
  }
  middle_seen <- middle_seen + 1
}
close(out_con)
close(src_con)

StopWatch(START_TIME)

 

1. 对于体积较大的 CSV 文件,不仅无法用 Microsoft Excel 打开,在 R 中使用 read.csv() 也无法一次性全部读入;因此改用 R 中的 readLines() 和 writeLines() 逐行流式读写,以减少内存消耗。

2. 对数据文件起止位置的定位暂时想不到更好的办法,不得不在写出结果之前先完整遍历文件两次。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  R 数据处理