È possibile utilizzare le seguenti tre funzioni per calcolare l'esatta dimensione del file che sarà scritto da write.table()
, write.csv()
e write.csv2()
, rispettivamente, senza dover produrre l'intero flusso di byte contemporaneamente, in memoria o su disco:
size.write.table <- function(x,...) {
x <- as.data.frame(x); ## write.table() coerces to data.frame
args <- list(...);
defaults <- formals(write.table);
## get write specs as locals
for (name in names(defaults)[-1])
assign(name,if (is.null(args[[name]])) defaults[[name]] else args[[name]]);
## normalize quote arg to logical, quoteIndexes as columns to quote
## note: regardless of qmethod, does not touch characters other than double-quote, and only adds one byte per embedded double-quote for either qmethod
quoteIndexesGiven <- F; ## assumption
if (is.logical(quote) && quote) {
quoteIndexes <- seq_along(x);
} else if (is.numeric(quote)) {
quoteIndexes <- quote;
quote <- T;
quoteIndexesGiven <- T;
}; ## end if
## normalize col.names arg to logical T/F, colNames as actual column names
emptyColNameForRowNames <- F; ## assumption
if (is.logical(col.names)) {
if (is.na(col.names)) {
emptyColNameForRowNames <- T;
col.names <- T;
}; ## end if
if (col.names) colNames <- names(x);
} else {
colNames <- as.character(col.names);
col.names <- T;
}; ## end if
## normalize row.names arg to logical, rowNames as actual row names
if (is.logical(row.names)) {
if (row.names) rowNames <- rownames(x);
} else {
rowNames <- as.character(row.names);
row.names <- T;
}; ## end if (else must be F)
## start building up file size
size <- 0L;
## 1: column header
if (col.names) {
## special case for zero columns: write.table() behaves as if there's one empty-string column name, weirdly
if (ncol(x)==0L) {
if (quote) size <- size + 2L;
} else {
if (emptyColNameForRowNames) {
if (quote) size <- size + 2L; ## two double-quotes
size <- size + nchar(sep,'bytes'); ## separator
}; ## end if
size <- size + sum(nchar(colNames,'bytes')); ## names (note: NA works with this; nchar() returns 2)
if (quote) size <- size + ncol(x)*2L + sum(do.call(c,gregexpr(perl=T,'"',colNames[quoteIndexes]))>0L); ## quotes and escapes
size <- size + nchar(sep,'bytes')*(ncol(x)-1L); ## separators
}; ## end if
size <- size + nchar(eol,'bytes'); ## eol; applies to both zero-columns special case and otherwise
}; ## end if
## 2: row names
if (row.names) {
## note: missing values are not allowed in row names
size <- size + sum(nchar(rowNames,'bytes')); ## names
size <- size + nchar(sep,'bytes')*nrow(x); ## separator (note: always present after row names, even for zero-column data.frame)
if (quote) size <- size + nrow(x)*2L + sum(do.call(c,gregexpr(perl=T,'"',rowNames))>0L); ## quotes and escapes (can ignore quoteIndexes, since row names are always quoted if any column is quoted)
}; ## end if
## 3: column content
for (ci in seq_along(x)) {
## calc depends on class
cl <- class(x[[ci]]);
## normalize date/time classes
if (identical(cl,c('POSIXct','POSIXt')) || identical(cl,c('POSIXlt','POSIXt')))
cl <- 'POSIXt';
## branch on normalized class
## note: can't write list type to file, so don't bother supporting list columns
if (length(cl)==1L && cl=='raw') {
size <- size + nrow(x)*2L;
## note: cannot have raw NAs
} else { ## remaining types can have NAs
size <- size + sum(is.na(x[[ci]]))*nchar(na,'bytes'); ## NAs
if (length(cl)==1L && cl=='logical') {
size <- size + sum((5:4)[na.omit(x[[ci]])+1L]); ## non-NAs
} else if (length(cl)==1L && cl%in%c('integer','numeric','complex','ts')) {
size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs
} else if (length(cl)==1L && cl%in%c('character','factor')) {
size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs, values -- as.character() required for factors to work
if (quote && ci%in%quoteIndexes) size <- size + sum(!is.na(x[[ci]]))*2L + sum(do.call(c,gregexpr(perl=T,'"',na.omit(x[[ci]])))>0L); ## quotes and escapes
} else if (length(cl)==1L && cl=='POSIXt') {
size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs
## special case for POSIXt: only quoted if explicitly specified by index in quote arg
if (quoteIndexesGiven && ci%in%quoteIndexes) size <- size + sum(!is.na(x[[ci]]))*2L; ## quotes (can't be any escapes)
} else {
stop(sprintf('unsupported class(es) %s.',paste(collapse=',',cl)));
}; ## end if
}; ## end if
}; ## end for
## 4: separators between columns
size <- size + nchar(sep,'bytes')*(ncol(x)-1L)*nrow(x);
## 5: eols
size <- size + nchar(eol,'bytes')*nrow(x);
size;
}; ## end size.write.table()
## note: documentation should say "col.names to NA if row.names = TRUE (the default) or given as a character vector" for csv functions
size.write.csv <- function(x,...) do.call(size.write.table,c(list(x),sep=',',dec='.',qmethod='double',col.names={ row.names <- list(...)$row.names; if (!identical(F,row.names)) NA else T; },list(...)));
size.write.csv2 <- function(x,...) do.call(size.write.table,c(list(x),sep=';',dec=',',qmethod='double',col.names={ row.names <- list(...)$row.names; if (!identical(F,row.names)) NA else T; },list(...)));
Qui sono alcune prove che dimostrano la correttezza:
size.write.func.test.impl <- function(funcName,x,...,tf='/tmp/size.write.func.test.impl.txt') {
writeFunc <- match.fun(funcName);
sizeFunc <- match.fun(paste0('size.',funcName));
writeFunc(x,tf,...);
expected <- file.info(tf)$size;
actual <- sizeFunc(x,tf,...);
cat(sprintf('%s: %d %s %d\n',if (expected==actual) 'SUCCESS' else 'FAILURE',actual,if (expected==actual) '==' else '!=',expected));
}; ## end size.write.func.test.impl()
size.write.table.test <- function(...) size.write.func.test.impl('write.table',...);
size.write.csv.test <- function(...) size.write.func.test.impl('write.csv',...);
size.write.csv2.test <- function(...) size.write.func.test.impl('write.csv2',...);
size.all.test <- function(...) {
size.write.table.test(...);
size.write.csv.test(...);
size.write.csv2.test(...);
}; ## end size.all.test()
size.all.test(data.frame(),quote=F);
## SUCCESS: 1 == 1
## SUCCESS: 1 == 1
## SUCCESS: 1 == 1
size.all.test(data.frame());
## SUCCESS: 3 == 3
## SUCCESS: 3 == 3
## SUCCESS: 3 == 3
size.all.test(data.frame(a=1:3));
## SUCCESS: 22 == 22
## SUCCESS: 25 == 25
## SUCCESS: 25 == 25
set.seed(1L);
df <- data.frame(raw=as.raw(0:255),logical=rep(c(F,T),len=256L),integer=0:255,double1=runif(256L),double2=runif(256L,-.Machine$double.xmax*0.5,.Machine$double.xmax*0.5),character=paste(sapply(0:255,intToUtf8),sample(c('','x','x"x','"x""x"'),256L,replace=T)),factor=factor(rep(letters,len=256L)),dtΩ=as.POSIXct('1970-01-01 00:00:00',tz='England/London'),stringsAsFactors=F);
for (ci in seq(2,ncol(df))) df[[ci]][sample(256L,10L)] <- NA;
head(df);
## raw logical integer double1 double2 character factor dtΩ
## 1 00 FALSE 0 0.2655087 -4.535097e+307 a 1970-01-01
## 2 01 TRUE 1 0.3721239 -2.670418e+305 \001 x"x b 1970-01-01
## 3 02 FALSE 2 0.5728534 -2.285466e+307 \002 x"x c 1970-01-01
## 4 03 TRUE 3 0.9082078 7.814417e+307 \003 d 1970-01-01
## 5 04 NA 4 0.2016819 4.311961e+306 \004 x"x e 1970-01-01
## 6 05 TRUE 5 0.8983897 -3.287178e+307 \005 x"x f 1970-01-01
size.all.test(df);
## SUCCESS: 20634 == 20634
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
size.all.test(df,eol='zzz');
## SUCCESS: 21148 == 21148
## SUCCESS: 21151 == 21151
## SUCCESS: 21151 == 21151
size.all.test(df,sep='///'); ## csv incarnations take ownership of their overridden arguments
## SUCCESS: 24744 == 24744
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
## Warning messages:
## 1: In writeFunc(x, tf, ...) : attempt to set 'sep' ignored
## 2: In writeFunc(x, tf, ...) : attempt to set 'sep' ignored
size.all.test(df,quote=F);
## SUCCESS: 18807 == 18807
## SUCCESS: 18808 == 18808
## SUCCESS: 18808 == 18808
size.all.test(df,quote=seq(2L,ncol(df),by=2L));
## SUCCESS: 20634 == 20634
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
size.all.test(df,row.names=F);
## SUCCESS: 19206 == 19206
## SUCCESS: 19206 == 19206
## SUCCESS: 19206 == 19206
size.all.test(df,row.names=seq(1234,len=nrow(df)));
## SUCCESS: 20998 == 20998
## SUCCESS: 21001 == 21001
## SUCCESS: 21001 == 21001
size.all.test(df,na='blah');
## SUCCESS: 20774 == 20774
## SUCCESS: 20777 == 20777
## SUCCESS: 20777 == 20777
size.all.test(iris);
## SUCCESS: 4818 == 4818
## SUCCESS: 4821 == 4821
## SUCCESS: 4821 == 4821
size.all.test(USAccDeaths);
## SUCCESS: 724 == 724
## SUCCESS: 727 == 727
## SUCCESS: 727 == 727
size.all.test(USArrests);
## SUCCESS: 1384 == 1384
## SUCCESS: 1387 == 1387
## SUCCESS: 1387 == 1387
size.all.test(USArrests,eol='\r\n'); ## you're probably on Windows
## SUCCESS: 1435 == 1435
## SUCCESS: 1438 == 1438
## SUCCESS: 1438 == 1438
Per l'avviso, è possibile semplicemente confrontare l'impronta di memoria dell'oggetto all'interno di R. Se è maggiore di una soglia, è possibile emettere un avviso. –
È possibile trovare [questo articolo] (https://data.nozav.org/posts/2016/02/compression_benchmark.html) pertinente: sulla dimensione dell'oggetto rispetto alla dimensione del file con vari formati di file. Per la dimensione dell'oggetto utilizzata nel benchmark, la differenza rispetto al file csv era "piuttosto grande" (343,3 MB vs 2885 MB) – Henrik