您的位置:首页 > 大数据

学长写的一个处理大数据多个文件的排序算法

2014-04-26 19:20 302 查看
/***************************************************************************

*

* Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved

* $Id$

*

**************************************************************************/

/**

* @file largeSort.c

* @author liuyi(liuyi04@baidu.com)

* @date 2014/04/08 19:51:59

* @version $Revision$

* @brief

* sort largeScale text data

**/

#include<stdio.h>

#include<stdlib.h>

#include<string.h>

#include<time.h>

#define MAXLINE 110000

#define MAXLEN 1024

/**********************小文件的内排序**********************/

/**
 * qsort-style comparator for elements that are themselves NUL-terminated
 * character arrays (each element pointer is the start of a string).
 *
 * @param p  address of the first string
 * @param q  address of the second string
 * @return   the value strcmp() yields for the two strings
 */
int compare_str(const void *p, const void *q)
{
    const char *lhs = p;
    const char *rhs = q;

    return strcmp(lhs, rhs);
}

/**
 * qsort-style comparator for an array of `char *` elements: each argument is
 * the address of an array slot, and the strings the slots point to are
 * compared.
 *
 * @param p  address of the first `char *` element
 * @param q  address of the second `char *` element
 * @return   the value strcmp() yields for the two pointed-to strings
 */
int compare_str2(const void* p, const void* q)
{
    char *const *lhs = p;
    char *const *rhs = q;

    return strcmp(*lhs, *rhs);
}

/**
 * Sort the records of one text file in memory and write them to another file.
 *
 * Records are read with fgets() into a MAXLEN-byte buffer, so a physical line
 * longer than MAXLEN-1 characters is treated as several independent records
 * (same limitation as the rest of this program). Records are ordered with
 * compare_str2(), i.e. by strcmp().
 *
 * Compared with the earlier version:
 *  - the record table grows on demand instead of being a fixed MAXLINE-slot
 *    array on the stack, so inputs with more than MAXLINE records no longer
 *    overflow it;
 *  - each record is stored in a buffer of its exact size;
 *  - a short record that lacks a trailing newline (the last line of a file
 *    with no final '\n') gets one appended, so it cannot be fused with the
 *    record that follows it after sorting;
 *  - allocation, read and write failures are detected, and every failure
 *    terminates the process with EXIT_FAILURE.
 *
 * The output file is opened only after the input has been read and closed.
 *
 * @param in_file   path of the file to read
 * @param out_file  path of the file to create/overwrite with the sorted records
 */
void sort_file(char* in_file,char* out_file)
{
    FILE *in, *out;
    char **lines = NULL;    /* growable table of heap-allocated records */
    size_t count = 0;       /* records stored so far */
    size_t capacity = 0;    /* slots currently allocated in lines */
    size_t i;
    char *buf;

    if (NULL == (in = fopen(in_file, "r")))
    {
        printf("cannot open %s file\n", in_file);
        exit(EXIT_FAILURE);
    }
    if (NULL == (buf = malloc(MAXLEN)))
    {
        printf("out of memory\n");
        exit(EXIT_FAILURE);
    }

    while (fgets(buf, MAXLEN, in) != NULL)
    {
        size_t n = strlen(buf);
        char *copy;

        /* A record shorter than the buffer without '\n' is an unterminated
         * last line; terminate it so sorting cannot glue it to a neighbour. */
        if (n > 0 && buf[n - 1] != '\n' && n + 1 < (size_t)MAXLEN)
        {
            buf[n++] = '\n';
            buf[n] = '\0';
        }

        if (count == capacity)
        {
            size_t new_capacity = capacity ? capacity * 2 : 1024;
            char **grown = NULL;

            if (new_capacity <= (size_t)-1 / sizeof *lines)
                grown = realloc(lines, new_capacity * sizeof *lines);
            if (NULL == grown)
            {
                printf("out of memory\n");
                exit(EXIT_FAILURE);
            }
            lines = grown;
            capacity = new_capacity;
        }

        if (NULL == (copy = malloc(n + 1)))
        {
            printf("out of memory\n");
            exit(EXIT_FAILURE);
        }
        memcpy(copy, buf, n + 1);
        lines[count++] = copy;
    }
    free(buf);

    if (ferror(in))
    {
        printf("cannot read %s file\n", in_file);
        exit(EXIT_FAILURE);
    }
    fclose(in);

    /* qsort() requires a valid array pointer, so skip it for trivial inputs. */
    if (count > 1)
        qsort(lines, count, sizeof *lines, compare_str2);

    if (NULL == (out = fopen(out_file, "w")))
    {
        printf("cannot open %s file\n", out_file);
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < count; ++i)
    {
        if (fputs(lines[i], out) == EOF)
        {
            printf("cannot write %s file\n", out_file);
            exit(EXIT_FAILURE);
        }
        free(lines[i]);
    }
    free(lines);

    /* fclose() flushes; a failure here means data was lost. */
    if (fclose(out) != 0)
    {
        printf("cannot write %s file\n", out_file);
        exit(EXIT_FAILURE);
    }
}

/**
 * Sort every split file: for i in [0, file_num) the file "part.<i>" is sorted
 * by sort_file() into "in.<i>".
 *
 * File names are formatted with snprintf() into buffers large enough for any
 * int. The earlier version formatted the index into a 2-byte buffer, which
 * overflowed as soon as the index reached 10.
 *
 * @param file_num  number of "part.<i>" files to sort
 */
void sort_all(int file_num)
{
    char in_file[32];
    char out_file[32];
    int i;

    for (i = 0; i < file_num; ++i)
    {
        snprintf(in_file, sizeof in_file, "part.%d", i);
        snprintf(out_file, sizeof out_file, "in.%d", i);
        sort_file(in_file, out_file);
    }
}

/**********************分割文件***********************/

/**
 * Split the input files "text.0" .. "text.<file_num-1>" into small files
 * "part.0", "part.1", ... of file_size lines each (the last one may hold
 * fewer). Lines are numbered across all inputs, so a part file may contain
 * lines from more than one input file.
 *
 * The data is copied character by character and lines are delimited by '\n',
 * so lines of any length are kept intact. If an input file ends without a
 * trailing newline, one is written so that its last line is not fused with
 * the first line of the next input.
 *
 * Compared with the earlier version:
 *  - file names are built with snprintf() into adequately sized buffers
 *    (the old 2- and 3-byte index buffers overflowed at index 10 / 100);
 *  - whether a part file is open is tracked explicitly, which removes the
 *    double fclose() when the last part was exactly full, the missing
 *    fclose() when it was one line short, and the use of uninitialised
 *    data when there was no input at all;
 *  - a non-positive file_size is rejected instead of dividing by zero;
 *  - read/write failures are detected, and every failure terminates the
 *    process with EXIT_FAILURE.
 *
 * @param file_num   number of "text.<i>" input files
 * @param file_size  number of lines per part file; must be > 0
 * @return the index of the last part file created, i.e. the number of part
 *         files minus one (0 when no line was read at all)
 */
int partion_file(int file_num, int file_size)
{
    char in_name[32];
    char out_name[32];
    FILE *in;
    FILE *out = NULL;           /* currently open part file, NULL if none */
    long long total_lines = 0;  /* completed lines over all inputs */
    int small_file_num = 0;     /* index of the most recently opened part */
    int i;

    if (file_size <= 0)
    {
        printf("invalid file_size %d\n", file_size);
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < file_num; ++i)
    {
        int mid_line = 0;       /* non-zero while inside an unterminated line */

        snprintf(in_name, sizeof in_name, "text.%d", i);
        if (NULL == (in = fopen(in_name, "r")))
        {
            printf("cannot open %s file\n", in_name);
            exit(EXIT_FAILURE);
        }

        for (;;)
        {
            int c = getc(in);

            if (c == EOF)
            {
                if (!mid_line)
                    break;
                c = '\n';       /* terminate an unterminated last line */
            }

            if (NULL == out)    /* first character of a new part file */
            {
                small_file_num = (int)(total_lines / file_size);
                snprintf(out_name, sizeof out_name, "part.%d", small_file_num);
                if (NULL == (out = fopen(out_name, "w")))
                {
                    printf("cannot open %s file\n", out_name);
                    exit(EXIT_FAILURE);
                }
            }

            if (putc(c, out) == EOF)
            {
                printf("cannot write %s file\n", out_name);
                exit(EXIT_FAILURE);
            }

            mid_line = (c != '\n');
            if (!mid_line)
            {
                ++total_lines;
                if (0 == total_lines % file_size)   /* part file is full */
                {
                    if (fclose(out) != 0)
                    {
                        printf("cannot write %s file\n", out_name);
                        exit(EXIT_FAILURE);
                    }
                    out = NULL;
                }
            }
        }

        if (ferror(in))
        {
            printf("cannot read %s file\n", in_name);
            exit(EXIT_FAILURE);
        }
        fclose(in);
    }

    /* Close the last, partially filled part file if there is one. */
    if (out != NULL && fclose(out) != 0)
    {
        printf("cannot write %s file\n", out_name);
        exit(EXIT_FAILURE);
    }

    return small_file_num;
}

/**************************外部排序(利用败者树)***********************/

/**
 * Replay the path from leaf s up to the root of the loser tree.
 *
 * Starting at the parent node (s + k) / 2 and halving the node index until
 * it reaches 0, the current candidate is compared (strcmp) with the entry
 * stored at each node. Whenever the candidate's string is greater, it is
 * stored at that node and the previously stored entry becomes the candidate.
 * The candidate left at the end is written to ls[0].
 *
 * @param ls  tree nodes; ls[t] holds an index into b, ls[0] receives the winner
 * @param b   strings indexed by the values stored in ls
 * @param k   number of leaves
 * @param s   index of the leaf whose string changed
 */
void adjust(int* ls, char**b, int k, int s)
{
    int winner = s;
    int node;

    for (node = (s + k) / 2; node > 0; node /= 2)
    {
        int resident = ls[node];

        if (strcmp(b[winner], b[resident]) > 0)
        {
            /* the candidate loses here: leave it and carry the other upward */
            ls[node] = winner;
            winner = resident;
        }
    }
    ls[0] = winner;
}

/**
 * Build the loser tree over the leaves b[0] .. b[k-1].
 *
 * b[k] is set to the empty string, which compares lowest under strcmp, and
 * every node of ls initially refers to it. The leaves are then replayed with
 * adjust() from the last one down to the first.
 *
 * @param ls  array of k tree nodes to fill
 * @param b   k leaf strings plus one writable spare buffer at b[k]
 * @param k   number of leaves
 */
void create_loser_tree(int* ls, char** b, int k)
{
    int node;
    int leaf;

    *b[k] = '\0';   /* sentinel: smallest possible key */

    for (node = 0; node < k; ++node)
    {
        ls[node] = k;
    }

    for (leaf = k - 1; leaf >= 0; --leaf)
    {
        adjust(ls, b, k, leaf);
    }
}

/**
 * Merge k sorted input streams into the file "sorted" using a loser tree.
 *
 * b[i] holds the current record of stream i. When a stream is exhausted its
 * slot is set to a sentinel string consisting of the single byte 0xFF; the
 * merge ends when the winner is that sentinel. Records are read with fgets()
 * into MAXLEN-byte buffers.
 *
 * NOTE(review): an input record that is exactly the one-byte string "\xFF"
 * (no newline) is indistinguishable from the sentinel, and records that start
 * with 0xFF followed by more bytes sort after it; inputs are assumed not to
 * contain such records.
 *
 * Compared with the earlier version:
 *  - a stream that is empty from the start is marked exhausted instead of
 *    leaving its buffer uninitialised;
 *  - k <= 0 produces an empty "sorted" file instead of reading ls[0] before
 *    it was ever set;
 *  - write and close errors on the output are detected, and every failure
 *    terminates the process with EXIT_FAILURE.
 *
 * @param ls  work array of k ints for the loser tree
 * @param b   k+1 writable buffers of MAXLEN bytes each
 * @param k   number of input streams
 * @param fp  k streams opened for reading; they are not closed here
 */
void k_merge(int* ls, char** b, int k, FILE** fp)
{
    int i, q;
    FILE* fp_out;
    char MAXKEY[2];

    MAXKEY[0] = (char)255;  /* largest byte value under strcmp's unsigned compare */
    MAXKEY[1] = '\0';

    if (NULL == (fp_out = fopen("sorted","w")))
    {
        printf("cannot open sorted file\n");
        exit(EXIT_FAILURE);
    }

    if (k > 0)
    {
        /* Load the first record of every stream. */
        for (i = 0; i < k; ++i)
        {
            if (NULL == fgets(b[i], MAXLEN, fp[i]))
                strcpy(b[i], MAXKEY);   /* stream is empty from the start */
        }

        create_loser_tree(ls, b, k);

        while (strcmp(b[ls[0]], MAXKEY) != 0)
        {
            q = ls[0];
            /* Emit the current minimum. */
            if (fputs(b[q], fp_out) == EOF)
            {
                printf("cannot write sorted file\n");
                exit(EXIT_FAILURE);
            }
            /* Refill from the same stream, or retire it when exhausted. */
            if (NULL == fgets(b[q], MAXLEN, fp[q]))
                strcpy(b[q], MAXKEY);
            adjust(ls, b, k, q);
        }
    }

    /* fclose() flushes; a failure here means output was lost. */
    if (fclose(fp_out) != 0)
    {
        printf("cannot write sorted file\n");
        exit(EXIT_FAILURE);
    }
}

/**
 * Merge the sorted files "in.0" .. "in.<file_num-1>" in a single pass by
 * handing them to k_merge(), which uses all of them as the runs of one
 * k-way merge.
 *
 * Compared with the earlier version:
 *  - file names are built with snprintf() into an adequately sized buffer
 *    (the old 3-byte index buffer overflowed at index 100);
 *  - every allocation is checked;
 *  - the buffer table and the stream table are released (they were leaked);
 *  - a non-positive file_num returns immediately without touching any file;
 *  - every failure terminates the process with EXIT_FAILURE.
 *
 * @param file_num  number of "in.<i>" files to merge
 */
void external_sort(int file_num)
{
    int i;
    int k = file_num;   /* every file is one run of the merge */
    int *ls;            /* loser tree nodes */
    char **b;           /* k record buffers plus one spare at b[k] */
    FILE **fp;          /* one input stream per run */
    char in_file[32];

    if (k <= 0)
        return;

    ls = malloc(sizeof *ls * (size_t)k);
    b = malloc(sizeof *b * ((size_t)k + 1));
    fp = malloc(sizeof *fp * (size_t)k);
    if (NULL == ls || NULL == b || NULL == fp)
    {
        printf("out of memory\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < k + 1; ++i)
    {
        if (NULL == (b[i] = malloc(MAXLEN)))
        {
            printf("out of memory\n");
            exit(EXIT_FAILURE);
        }
    }

    /* Open all k runs for reading. */
    for (i = 0; i < k; ++i)
    {
        snprintf(in_file, sizeof in_file, "in.%d", i);
        if (NULL == (fp[i] = fopen(in_file, "r")))
        {
            printf("cannot open %s\n", in_file);
            exit(EXIT_FAILURE);
        }
    }

    k_merge(ls, b, k, fp);

    for (i = 0; i < k; ++i)
    {
        fclose(fp[i]);
        free(b[i]);
    }
    free(b[k]);
    free(b);
    free(fp);
    free(ls);
}

/**
 * Program entry point: split 12 input files into parts of 100000 lines,
 * sort each part, merge the sorted parts, and print the number of parts and
 * the processor time used as measured by clock().
 *
 * @return 0
 */
int main()
{
    const int input_count = 12;
    const int lines_per_part = 100000;
    const clock_t began = clock();

    /* partion_file() returns the index of the last part, hence the +1. */
    const int part_count = partion_file(input_count, lines_per_part) + 1;
    printf("No. of small files:%d\n", part_count);

    sort_all(part_count);
    external_sort(part_count);

    const clock_t finished = clock();
    printf("the running time is: %fs\n", (double)(finished - began) / CLOCKS_PER_SEC);

    return 0;
}

/* vim: set ts=4 sw=4 sts=4 tw=100 noet: */
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐