您的位置:首页 > 编程语言

超过6G的大文件是如何读取的,附源代码。含有网上很少有论述的信息

2017-12-29 10:47 786 查看


因为工作关系,需要从大文件中筛选数据,进行比对。文件大小一般都在6g左右。
读取大文件有如下两种方法:一是用fopen打开文件,用fgets(或getline)循环逐行读取,最后用fclose关闭文件;二是用open函数打开文件,用lseek获取文件大小,用mmap把大文件映射到内存,用munmap解除内存映射,最后用close关闭文件描述符。方式一较慢,就不再详细描述,下面主要描述方式二。

方式二,网上介绍也有很多,但是鲜有介绍文件大于4G时的读取方法。在long为32位的平台上,用long型(无符号)保存文件大小时,最多只能表示4294967295个字节,也就是4G。解决方法是用long long来保存文件的大小。

样例代码如下:

#include<stdio.h>

#include<string.h>

#include<stdlib.h>

#include <fcntl.h>

#include <sys/stat.h>

#include <sys/time.h> 

#include <sys/mman.h> 

#include <sys/types.h>

#include <errno.h>

#include <unistd.h>

using namespace std;

#include "dlist.h"

// Call-flow record: the identifier and billing id of one call, plus a flag
// telling whether a response has been recorded for it.
typedef struct s_callflow
{
   char identifier[100];
   char billingid[200];
   bool response;

   // A default-constructed record has two empty strings and response=false.
   s_callflow() : response(false)
   {
      identifier[0] = '\0';
      billingid[0] = '\0';
   }
} callflow;

/*
 * Allocates a callflow record with malloc and fills it in.
 *
 * identifier / billingid: NUL-terminated strings copied into the record's
 *   fixed-size fields. Text longer than a field is truncated, so the copy
 *   can never run past the destination buffer. A NULL pointer is stored
 *   as an empty string.
 * response: initial value of the record's response flag.
 *
 * Returns the new record, or NULL when malloc fails. The record comes from
 * malloc, so it has to be released with free().
 */
callflow *call_instance(char *identifier,char *billingid ,bool response)
{
    callflow *call_ptr = (callflow *)malloc(sizeof *call_ptr);
    if( call_ptr==NULL )
        return NULL;

    /* snprintf always NUL-terminates and never writes more than the size given. */
    snprintf(call_ptr->identifier, sizeof call_ptr->identifier, "%s",
             identifier!=NULL ? identifier : "");
    snprintf(call_ptr->billingid, sizeof call_ptr->billingid, "%s",
             billingid!=NULL ? billingid : "");
    call_ptr->response = response;

    return call_ptr;
}

void updatecall(DList *list,char *identifier)

{

    if(list==NULL) return;

    DListElmt *new_element;

    new_element = list->tail;   

    while(new_element!=NULL)

    {

        callflow * flow = (callflow *)new_element->data;

        if(flow!=NULL)

        {

            if(strcmp(flow->identifier,identifier)==0 && flow->response==false)

            {

                flow->response=true;

                break;

            }

        }

        if( new_element == list->head )

        {

            break;

        }

        new_element = new_element->prev;

    }

  

}

/*
 * Element destructor passed to dlist_init: releases one list payload with
 * free(). Passing NULL is harmless because free(NULL) does nothing.
 */
void destroy(void *data)
{
    free(data);
}

void output(DList *list,bool calling)

{

    if(list==NULL) return;

    DListElmt *new_element;

    new_element=list->head;

    int count = list->size;

    int response=0;

    FILE *out;

    if(calling==1)

    {

        out=fopen("calling.txt","w");

    }

    else

    {

        out=fopen("called.txt","w");

    }

    char buffer[255];

    while(new_element!=NULL)

    {

         callflow * flow = (callflow *)new_element->data;

         if(flow!=NULL)

         {

            if(flow->response==true)

            {

               response++;

            }

            else

            {

                 sprintf(buffer,"billingid=%s identifier=%s\n",flow->billingid,flow->identifier);

                 fwrite(buffer,strlen(buffer),1,out);

            }

         }

         new_element=new_element->next;

    }

     sprintf(buffer,"count=%d response=%d\n",count,response);

     fwrite(buffer,strlen(buffer),1,out);

     fclose(out);

}

//

int main(int argc,char * argv[])

{

   //size_t lsize=0;

   long long lsize=0;

   const char *localpc="16592304";

   DList calling_node;

   DList called_node;

   dlist_init(&calling_node, destroy);

   dlist_init(&called_node, destroy); 

   char opc[3][200];

   char dpc[3][200];

   char identifier[2][100];

   char billingID[2][200];

   //FILE * fp=NULL;

   //fp = fopen(argv[1],"r");

   char *pBuffer=NULL; 

   char *pStart=NULL,*pEnd=NULL; 

   int fd = open(argv[1],O_RDONLY);

   //size_t nFileSize=0; 

   //size_t nOffset=0; 

   //size_t nLineAmount=0; 

   //struct stat fileState; 

   //fstat(fd,&fileState); 

   //nFileSize=fileState.st_size;

   long long nFileSize=0;

   long long nOffset;

   long long nLineAmount;

   nFileSize =(long long)lseek(fd,0,SEEK_END);

   //nFileSize = (unsigned long)fileState.st_size;

   pBuffer=(char *)mmap(NULL,nFileSize,PROT_READ,MAP_SHARED,fd,0); 

   pEnd=pStart=pBuffer;

   char line[2048];

   int flag;

   flag=-1;

   int in=-1;

   printf("nFileSize=%lld\n",nFileSize);

   int load = 0;

   int preload = 0;

   while(lsize<nFileSize-4)

   {

       lsize++;

       load = (int) (((float)lsize / (float)nFileSize) * 100);

       if(preload!=load)

       {

          printf("%3d \n",load);

          preload = load;

          sleep(1);

          printf("\b");

       }

       char a = *pEnd;

       if(a==13)

       {

          char b = *(++pEnd);

          if(b==10)

          {

               int len = pEnd-pStart;

               if(len<1) break;

               if(len>2047) break;

               strncpy(line,pStart,len);

               pStart=pEnd+1;

               if(strstr(line," OPC")!=NULL)

               {

                   sscanf(line,"%s%s%s",opc[0],opc[1],opc[2]);

               }

               else if(strstr(line," DPC")!=NULL)

               {

                   sscanf(line,"%s%s%s",dpc[0],dpc[1],dpc[2]);

               }

               else if(strstr(line,"queryWithPerm")!=NULL)

               {

                  flag=0; //

               }

               else if(strstr(line,"response")!=NULL)

               {

                  flag=1; //

               }

               else if(strstr(line,"identifier")!=NULL)

               {

                   sscanf(line,"%s%s",identifier[0],identifier[1]);

                   //

                   if(strcmp(opc[1],localpc)==0)

                   {

                       //

                       if(flag==1) //response

                       {

                            //找主叫流程

                            updatecall(&calling_node,identifier[1]);

                            //找被叫流程

                            updatecall(&called_node,identifier[1]);

                            flag=-1;

                       }

                   }

               }

               else if(strstr(line,"originationRequest")!=NULL)

               {

                   //主叫流程

                   if(strstr(line,"originationRequestRes")==NULL)

                   {

                       in = 1;

                   }

                  

               }

               else if(strstr(line,"billingID")!=NULL)

               {

                  sscanf(line,"%s%s",billingID[0],billingID[1]);

                  if(in==1)

                  {

                     callflow * calling = call_instance(identifier[1],billingID[1] ,false);

                     dlist_ins_next(&calling_node,calling_node.tail,(void *)calling);

                     in = -1;

                  }

               }

               else if(strstr(line,"initial-Termination (38)")!=NULL)

               {

                   //被叫流程

                   callflow * called = call_instance(identifier[1],billingID[1] ,false);

                   dlist_ins_next(&called_node,called_node.tail,(void *)called);

               }

          }

       }

       else

       {

          pEnd++;

       }

     

   }

   output(&calling_node,true);

   output(&called_node,false);

   dlist_destroy(&calling_node);

   dlist_destroy(&called_node);

   munmap(pBuffer,nFileSize);

   //fclose(fp);

   close(fd);

}


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  mmap 大文件 超过4G