在Ubuntu下生成大文件,并测试文件读写效率

Posted on 2010-05-23 Edited on 2023-10-05 Word count in article: 4.1k Reading time ≈ 4 mins.

在Ubuntu或者其他的Linux下生成大文件,必须使用编译参数 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64,否则生成的文件最大只能到2G左右.
原因在于fseek:这个函数的offset参数是long型,在32位系统上只有32位;fseeko的offset参数是off_t型,但是这个
off_t默认还是32位,只有加了 -D_FILE_OFFSET_BITS=64的宏定义才被作为64位对待.

写了下面一段代码来生成大文件,传说有一种方法:直接跳转nG位置之后,然后写入一个字符.这种方法可能更快,不过我为了保守的生成文件,还是使用下面的一个写法:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Large-file generator.
 *
 * Build with -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 so that off_t is
 * 64 bits wide and the output can grow past 2 GiB on 32-bit systems.
 */

/*
 * Create (or truncate) `filename` and fill it with `count` consecutive
 * blocks of `block_size` bytes, every byte being the character '1'.
 *
 * A progress line is printed after every 10th block. Any failure (bad
 * block size, open, allocation, write or close error) is reported and
 * terminates the process with exit status 1.
 *
 * The data is written sequentially on purpose. A sparse alternative would
 * be to fseeko() to the desired size and write a single block there, which
 * is faster but does not actually store the data in between.
 */
void gen_file(const char *filename, int block_size, int count)
{
    FILE *file;
    char *buf;
    long long size = 0; /* bytes written so far */
    int i;

    if (block_size <= 0) {
        fprintf(stderr, "fatal error in gen_file: invalid block size %d\n",
                block_size);
        exit(1);
    }

    file = fopen(filename, "w+b");
    if (file == NULL) {
        printf("fatal error in gen_file: fopen\n");
        exit(1);
    }

    buf = malloc((size_t)block_size);
    if (buf == NULL) {
        fprintf(stderr, "fatal error in gen_file: malloc\n");
        fclose(file);
        exit(1);
    }
    memset(buf, '1', (size_t)block_size);

    /*
     * Consecutive fwrite() calls already advance the file position, so no
     * explicit seek is needed. Seeking to a separately tracked offset that
     * starts one block ahead would leave a one-block hole in the file.
     */
    for (i = 1; i <= count; i++) {
        if (fwrite(buf, (size_t)block_size, 1, file) != 1) {
            fprintf(stderr, "fatal error in gen_file: fwrite\n");
            free(buf);
            fclose(file);
            exit(1);
        }
        size += block_size;
        if (i % 10 == 0) {
            printf("count is %d, size is %lld\n", i, size);
        }
    }

    free(buf);

    /* fclose() flushes buffered data, so a failure here means data loss. */
    if (fclose(file) != 0) {
        fprintf(stderr, "fatal error in gen_file: fclose\n");
        exit(1);
    }
}

int main(void)
{
    int block_size = 1024 * 1024 * 20; /* 20M written per block */
    int count = 50;

    /* sizeof yields size_t, which must be printed with %zu. */
    printf("size of char %zu\n", sizeof(char));

    gen_file("smallfile", block_size, count * 2); /* gen 2G file */
    return 0;
}

int block_size = 1024*1024*20; // 20M

这个是每次写入的数据大小;gen_file("smallfile",block_size,count*2)的第三个参数可以调节生成文件的大小.

下面写了一个测试大文件和小文件的随机读写和顺序读写.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>

/*
 * Random vs. sequential block-read benchmark for big and small files.
 *
 * Build with -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 so that off_t and
 * struct stat can describe files larger than 2 GiB on 32-bit systems.
 */

/* Number of bytes fetched by each read. */
static const int block_size = 8092;

/* Wall-clock timestamps taken by start_time() and end_time(). */
static struct timeval start;
static struct timeval end;

/* Record the wall-clock time at which a measurement begins. */
static void start_time(void)
{
    gettimeofday(&start, NULL);
}

/* Record the wall-clock time at which a measurement ends. */
static void end_time(void)
{
    gettimeofday(&end, NULL);
}

/* Print the seconds elapsed between start_time() and end_time(). */
static void print_time(void)
{
    double elapsed = (double)(end.tv_sec - start.tv_sec)
                   + (double)(end.tv_usec - start.tv_usec) / 1000000.0;

    printf("%lf sec\n", elapsed);
}

/*
 * Return the size of `filename` in bytes; exits with status 1 if the file
 * cannot be stat()ed. The result is off_t so sizes above 4 GiB survive on
 * platforms where size_t is only 32 bits wide.
 */
static off_t look_size(const char *filename)
{
    struct stat st;

    if (stat(filename, &st) != 0) {
        perror(filename);
        exit(1);
    }
    return st.st_size;
}

/*
 * Return how many whole blocks of `block_size` bytes `filename` holds.
 * Exits with status 1 when the file is shorter than one block, because the
 * callers use the count as a modulus.
 */
static off_t block_count(const char *filename)
{
    off_t blocks = look_size(filename) / block_size;

    if (blocks <= 0) {
        fprintf(stderr, "fatal error: %s is smaller than one block\n",
                filename);
        exit(1);
    }
    return blocks;
}

/*
 * Seek to block number `offset` (counted in units of `blk_size` bytes) and
 * read that one block. The byte position is computed in off_t: an int
 * would overflow for positions beyond 2 GiB.
 */
static void read_blocks(FILE *file, off_t offset, int blk_size)
{
    off_t where = offset * blk_size;
    char *buf;

    if (fseeko(file, where, SEEK_SET) != 0) {
        perror("fseeko");
        exit(1);
    }

    buf = malloc((size_t)blk_size);
    if (buf == NULL) {
        fprintf(stderr, "fatal error in read_blocks: malloc\n");
        exit(1);
    }

    /* Actually fetch the data; a seek alone performs no read. */
    if (fread(buf, (size_t)blk_size, 1, file) != 1) {
        fprintf(stderr, "fatal error in read_blocks: short read\n");
        free(buf);
        exit(1);
    }
    free(buf);
}

/* Open `filename` for reading, or exit with status 1 when that fails. */
static FILE *fopen_safe(const char *filename)
{
    FILE *file = fopen(filename, "r");

    if (file == NULL) {
        fprintf(stderr, "fatal error in fopen_safe: cannot open %s\n",
                filename);
        exit(1);
    }
    return file;
}

/* Open `filename`, read one randomly chosen block, and close it again. */
static void random_read(const char *filename)
{
    FILE *file = fopen_safe(filename);
    off_t blocks = block_count(filename);

    read_blocks(file, rand() % blocks, block_size);
    fclose(file);
}

/*
 * Open `filename`, read block number `order` (wrapped around at the end of
 * the file), and close it again.
 */
static void order_read(const char *filename, int order)
{
    FILE *file = fopen_safe(filename);
    off_t blocks = block_count(filename);

    read_blocks(file, order % blocks, block_size);
    fclose(file);
}

/* Time `times` random block reads from `filename` and print the result. */
static void random_test(const char *filename, int times)
{
    int count;

    start_time();
    for (count = 1; count <= times; count++) {
        random_read(filename);
    }
    end_time();
    print_time();
}

/* Time `times` reads of blocks 1, 2, 3, ... from `filename`. */
static void order_test(const char *filename, int times)
{
    int count;

    start_time();
    for (count = 1; count <= times; count++) {
        order_read(filename, count);
    }
    end_time();
    print_time();
}

/*
 * Time random reads spread over `num` files named "<base_name>_0",
 * "<base_name>_1", ...: the first 90% of the iterations read one block
 * from file 0, the next 6% read from two files, and the final 4% read
 * from three files. The branches are mutually exclusive so each iteration
 * falls into exactly one of those groups.
 */
static void random_test_diff(const char *base_name, int num, int times)
{
    char **names;
    int one_file_end = (int)((long long)times * 90 / 100);
    int two_file_end = (int)((long long)times * 96 / 100);
    int i;

    if (num <= 0) {
        fprintf(stderr, "fatal error in random_test_diff: num must be > 0\n");
        exit(1);
    }

    /* An array of pointers, so each element is sizeof(char *) wide. */
    names = malloc((size_t)num * sizeof *names);
    if (names == NULL) {
        fprintf(stderr, "fatal error in random_test_diff: malloc\n");
        exit(1);
    }

    for (i = 0; i < num; i++) {
        /* Ask snprintf for the exact length so any index width fits. */
        int len = snprintf(NULL, 0, "%s_%d", base_name, i);

        if (len < 0) {
            fprintf(stderr, "fatal error in random_test_diff: snprintf\n");
            exit(1);
        }
        names[i] = malloc((size_t)len + 1);
        if (names[i] == NULL) {
            fprintf(stderr, "fatal error in random_test_diff: malloc\n");
            exit(1);
        }
        snprintf(names[i], (size_t)len + 1, "%s_%d", base_name, i);
    }

    start_time();
    for (i = 0; i < times; i++) {
        if (i < one_file_end) {
            random_read(names[0]);
        } else if (i < two_file_end) {
            random_read(names[i % num]);
            random_read(names[(i + 1) % num]);
        } else {
            random_read(names[i % num]);
            random_read(names[(i + 1) % num]);
            random_read(names[(i + 2) % num]);
        }
    }
    end_time();
    print_time();

    for (i = 0; i < num; i++) {
        free(names[i]);
    }
    free(names);
}

int main(void)
{
    int times = 1000;
    const char *filename = "bigfile";

    srand((unsigned)time(NULL));

    random_test(filename, times);
    order_test(filename, times);

    filename = "smallfile";
    random_test(filename, times);
    order_test(filename, times);
    random_test_diff(filename, 3, times);
    return 0;
}

当然你这里做测试的时候可以把gettimeofday替换为clock()或者time()函数.其中clock()得到的是程序实际占用CPU的时间:你感觉可能执行了七八秒,
其实程序只占用CPU执行了1秒.如果是time()函数或者gettimeofday()函数,得到的就是真正流逝的时间,也就是你感觉到的执行时间.所以各有千秋:
前者是为了排除运行环境中其他软件或者OS的影响,单独检测程序本身的效率;后者是程序在系统环境中的实际执行时间.

下面是我得到的结论,创建1个10G文件,创建3个2G小文件.
大文件随机访问7.199645 sec
大文件顺序访问0.013617 sec
单个小文件随机访问5.119245 sec
小文件顺序访问0.014190 sec
多个小文件的随机访问测试:90%的几率访问同一个小文件，6%访问两个小文件，4%访问3个小文件 9.791248
sec.并且90%几率发生在前,所以如果随机发生多个文件的随机访问,那么可能速度会更慢.这里慢的原因,是因为fopen.

这也就是数据库为什么只有一个数据文件的原因.有时进行文件切分,反而不利于效率的提高.但是对于100G以上的大文件,是否这个结论又要改变了呢?我没有做过这个实
验.