A recent data acquisition brought with it the requirement to process the fixed-width text files that make up the data. This would not have been much of a discussion point were it not for the fact that some of the files were huge - 60 GB in one case. Most of the content of these large files is the space character, which serves as padding for the fixed-width fields; this illustrates how inefficient fixed-width text files are, but that is not the point we're making here today.
Jupiter             A planet
Andromeda           A "nearby" galaxy
Sirius              A star
Eros                An asteroid, a rocky body
Titan               A moon

...we used texttocsv.exe to create the following CSV (test.csv):-
Jupiter,A planet
Andromeda,"A ""nearby"" galaxy"
Sirius,A star
Eros,"An asteroid, a rocky body"
Titan,A moon

Note that the "Andromeda" row has the quotes around "nearby" correctly escaped.
texttocsv.exe test.csv test.txt 20,50
// TextToCSV - converts a fixed-width text file into a CSV file.
//
// Usage: texttocsv <dest.csv> <source.txt> <widths> [start column]
//   widths        comma separated list of column widths, e.g. 20,50
//   start column  optional zero based index of the first column to export
//
// Each source line is split at the given column widths; every field is
// trimmed of leading/trailing spaces and, if it contains a comma or a double
// quote, enclosed in double quotes with embedded quotes doubled.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <new>
#include <vector>

//class to hold the array of column widths
class CWidthList
{
private:
    std::vector<unsigned int> m_Column_Width;
    unsigned int m_Total_Width;

    //prevent new from being used - force
    //any instance to be on the stack
    void* operator new(size_t);
    void* operator new[](size_t);

public:
    //constructor - starts with no columns
    CWidthList() : m_Total_Width(0)
    {
    }

    //return width of column number (zero based)
    //returns 0 if column number was invalid
    unsigned int GetWidth(unsigned int column_number) const
    {
        if (column_number < m_Column_Width.size())
        {
            return m_Column_Width[column_number];
        }
        return 0;
    }

    //return total width of columns
    unsigned int GetTotalWidth(void) const
    {
        return m_Total_Width;
    }

    //return the number of columns
    unsigned int GetNumCols(void) const
    {
        return static_cast<unsigned int>(m_Column_Width.size());
    }

    //parse the widths string (e.g. "10,10,20")
    //Digits build up the current width, a comma ends it; every other
    //character is ignored. An empty entry counts as a column of width 0.
    //returns false if out of memory, or if the widths are so large that
    //a row buffer of twice the total width could not be addressed with an int
    //On failure the previously parsed widths are left unchanged.
    bool ParseWidths(const char* p_widths)
    {
        //the reader needs (total * 2 + 2) to fit into the int taken by fgets
        const unsigned int max_total = (static_cast<unsigned int>(INT_MAX) - 2u) / 2u;

        if (p_widths == NULL)
        {
            p_widths = "";
        }

        std::vector<unsigned int> widths;
        unsigned int total = 0;
        unsigned int current = 0;

        try
        {
            for (const char* p = p_widths; ; ++p)
            {
                const char c = *p;
                if (c >= '0' && c <= '9')
                {
                    const unsigned int digit = static_cast<unsigned int>(c - '0');
                    if (current > (max_total - digit) / 10u)
                    {
                        //width too large to be usable
                        return false;
                    }
                    current = current * 10u + digit;
                }
                else if (c == ',' || c == '\0')
                {
                    if (current > max_total - total)
                    {
                        //total width too large to be usable
                        return false;
                    }
                    widths.push_back(current);
                    total += current;
                    current = 0;
                    if (c == '\0')
                    {
                        break;
                    }
                }
            }
        }
        catch (const std::bad_alloc&)
        {
            //out of memory for the width list
            return false;
        }

        //commit only once the whole string has been parsed
        m_Column_Width.swap(widths);
        m_Total_Width = total;
        return true;
    }
};

//simple 8 bit character buffer class
class CCharBuffer
{
private:
    char* m_Data;
    unsigned int m_Max_Chars;   //maximum number of chars allowed
    unsigned int m_Pos;         //current write pos

    //prevent new from being used -
    //force any instance to be on the stack
    void* operator new(size_t);
    void* operator new[](size_t);

    //the buffer owns m_Data, so copying is not allowed
    //(declared, never defined)
    CCharBuffer(const CCharBuffer&);
    CCharBuffer& operator=(const CCharBuffer&);

public:
    //constructor - tries to allocate a small initial buffer
    CCharBuffer() : m_Data(NULL), m_Max_Chars(0), m_Pos(0)
    {
        //a failure here is not fatal: the buffer stays empty and the
        //next CheckSpace / Add call will try to allocate again
        CheckSpace(32);
    }

    //destructor
    ~CCharBuffer()
    {
        //allocated with new[], so release with delete[]
        delete[] m_Data;
        m_Data = NULL;
    }

    //make sure the buffer can hold at least num_chars characters
    //(one extra char is always allocated on top of that)
    //existing content up to the current position is preserved
    //return false if failed to allocate a buffer
    bool CheckSpace(const unsigned int num_chars)
    {
        if (m_Data != NULL && num_chars <= m_Max_Chars)
        {
            return true;
        }
        if (num_chars == UINT_MAX)
        {
            //num_chars + 1 would wrap around
            return false;
        }

        //grow geometrically so that adding one char at a time
        //does not reallocate for every single char
        unsigned int new_capacity = num_chars;
        if (m_Max_Chars < UINT_MAX / 2u && m_Max_Chars * 2u > new_capacity)
        {
            new_capacity = m_Max_Chars * 2u;
        }

        const size_t alloc_size = static_cast<size_t>(new_capacity) + 1u;
        char* new_dest = new (std::nothrow) char[alloc_size];
        if (new_dest == NULL)
        {
            //out of memory when resizing the buffer
            return false;
        }

        if (m_Pos > 0 && m_Data != NULL)
        {
            //copy the existing data into the new buffer
            memcpy(new_dest, m_Data, m_Pos);
        }
        //keep the unused tail in a defined state
        memset(new_dest + m_Pos, 0, alloc_size - m_Pos);

        delete[] m_Data;            //delete the OLD buffer (if it existed)
        m_Data = new_dest;          //and use the new one
        m_Max_Chars = new_capacity;
        return true;
    }

    //specify the current position
    //the position is clamped to the last char the buffer can hold
    void SetPos(const unsigned int val)
    {
        if (m_Max_Chars == 0)
        {
            m_Pos = 0;
        }
        else if (val >= m_Max_Chars)
        {
            m_Pos = m_Max_Chars - 1;
        }
        else
        {
            m_Pos = val;
        }
    }

    //return the current position
    unsigned int GetPos(void) const
    {
        return m_Pos;
    }

    //make space and add a character
    //returns false if failed to make space
    bool Add(const char c)
    {
        if (m_Pos == UINT_MAX || CheckSpace(m_Pos + 1) == false)
        {
            return false;
        }
        m_Data[m_Pos] = c;
        m_Pos++;
        return true;
    }

    //make space and add num_chars characters from src
    //returns false if failed to make space
    bool Add(const char* src, const unsigned int num_chars)
    {
        if (num_chars == 0)
        {
            return true;
        }
        if (src == NULL || num_chars > UINT_MAX - m_Pos)
        {
            return false;
        }
        if (CheckSpace(m_Pos + num_chars) == false)
        {
            return false;
        }
        memcpy(m_Data + m_Pos, src, num_chars);
        m_Pos += num_chars;
        return true;
    }

    //read pointer to the buffer - only valid while the
    //instance is in scope and until the next call that resizes it
    char* Read(void)
    {
        return m_Data;
    }
};

//class to process a fixed width text file into a CSV
class CTextToCSV
{
public:
    //error codes
    enum ErrCode
    {
        None = 0,
        OutOfMem,
        FileNotFound,
        FileOpenForWriteFailed,
        FileWriteFailed
    };

private:
    //members
    FILE* m_Dest;               //destination (CSV) file
    FILE* m_Src;                //source text file
    CCharBuffer m_Src_Buffer;
    CCharBuffer m_Dest_Buffer;
    CWidthList m_Width;
    unsigned int m_Start_Col;   //start column (optional)

protected:
    //prevent new from being used - force any
    //instance to be on the stack
    void* operator new(size_t);
    void* operator new[](size_t);

    //private methods
private:
    //append the field at [curpos, curpos + width) of the source row
    //to m_Dest_Buffer, trimmed of leading and trailing spaces
    //returns OutOfMem if failed to resize dest buffer
    ErrCode ReadField(const unsigned int curpos, const unsigned int width)
    {
        const char* src_buf = m_Src_Buffer.Read();

        //first, scan src to get the trimmed extents;
        //start is inclusive, end is one past the last character
        unsigned int start = curpos;
        unsigned int end = curpos + width;
        while (start < end && src_buf[start] == 0x20)
        {
            start++;
        }
        while (end > start && src_buf[end - 1] == 0x20)
        {
            end--;
        }

        //a field holding a comma or a double quote must be quoted
        bool enclose_in_quotes = false;
        for (unsigned int i = start; i < end; i++)
        {
            if (src_buf[i] == ',' || src_buf[i] == '"')
            {
                enclose_in_quotes = true;
                break;
            }
        }

        if (!enclose_in_quotes)
        {
            //plain field - copy in one go
            if (m_Dest_Buffer.Add(src_buf + start, end - start) == false)
            {
                return OutOfMem;
            }
            return None;
        }

        //enclose in quotes and escape any double quote character
        if (m_Dest_Buffer.Add('"') == false)
        {
            return OutOfMem;
        }
        for (unsigned int i = start; i < end; i++)
        {
            //a double quote is escaped by writing it twice
            if (src_buf[i] == '"' && m_Dest_Buffer.Add('"') == false)
            {
                return OutOfMem;
            }
            if (m_Dest_Buffer.Add(src_buf[i]) == false)
            {
                return OutOfMem;
            }
        }
        if (m_Dest_Buffer.Add('"') == false)
        {
            return OutOfMem;
        }
        return None;
    }

    //convert the row held in m_Src_Buffer and write it to the dest file
    //returns ErrCode (normally None)
    ErrCode ProcessRow(void)
    {
        m_Dest_Buffer.SetPos(0);

        char* src_buf = m_Src_Buffer.Read();
        const unsigned int total_width = m_Width.GetTotalWidth();

        //the row ends at the first CR or LF (whichever comes first), so
        //both CRLF and LF terminated files are handled
        const size_t len = strcspn(src_buf, "\r\n");
        src_buf[len] = 0;

        //pad short rows with spaces so every field can be read;
        //the buffer always holds at least total_width characters
        if (len < total_width)
        {
            memset(src_buf + len, 0x20, total_width - len);
        }

        //skip the columns before the start column (if one was given)
        const unsigned int num_cols = m_Width.GetNumCols();
        unsigned int col = 0;
        unsigned int curpos = 0;
        if (m_Start_Col > 0 && m_Start_Col < num_cols)
        {
            for (; col < m_Start_Col; col++)
            {
                curpos += m_Width.GetWidth(col);
            }
        }

        //read each field, separated by commas
        const unsigned int first_col = col;
        for (; col < num_cols; col++)
        {
            if (col != first_col && m_Dest_Buffer.Add(',') == false)
            {
                return OutOfMem;
            }
            const unsigned int width = m_Width.GetWidth(col);
            if (ReadField(curpos, width) != None)
            {
                return OutOfMem;
            }
            curpos += width;
        }

        //write the row followed by CRLF
        const unsigned int out_len = m_Dest_Buffer.GetPos();
        if (out_len > 0 &&
            fwrite(m_Dest_Buffer.Read(), 1, out_len, m_Dest) != out_len)
        {
            return FileWriteFailed;
        }
        if (fwrite("\r\n", 1, 2, m_Dest) != 2)
        {
            return FileWriteFailed;
        }
        return None;
    }

    //discard the remainder of the current source line
    //(used when a line is longer than the row buffer)
    void SkipRestOfLine(void)
    {
        int c;
        do
        {
            c = fgetc(m_Src);
        } while (c != EOF && c != '\n');
    }

    //close files
    void Close(void)
    {
        //close src file
        if (m_Src != NULL)
        {
            fclose(m_Src);
            m_Src = NULL;
        }
        //close dest file
        if (m_Dest != NULL)
        {
            fclose(m_Dest);
            m_Dest = NULL;
        }
    }

protected:
    //process progress report as each row is read
    virtual void Progress(unsigned int /*row_num*/)
    {
    }

public:
    //constructor
    CTextToCSV() : m_Dest(NULL), m_Src(NULL), m_Start_Col(0)
    {
    }

    //destructor - virtual because the class is meant to be derived from
    virtual ~CTextToCSV()
    {
        //close open files; the buffers free themselves
        Close();
    }

    //convert p_src_file into p_dest_file
    //  p_widths     comma separated column widths
    //  p_start_col  optional (may be NULL) zero based first column to
    //               export; ignored unless it is a valid column index
    //returns None on success, otherwise the first error encountered
    //both files are closed again before returning
    ErrCode Process(const char* p_dest_file,
                    const char* p_src_file,
                    const char* p_widths,
                    const char* p_start_col)
    {
        //release anything left over from a previous call
        Close();
        m_Start_Col = 0;

        //parse the widths string
        if (m_Width.ParseWidths(p_widths) == false)
        {
            return OutOfMem;
        }

        //ensure the src buffer has sufficient space to read a row;
        //ParseWidths guarantees that this size fits into an int
        const unsigned int read_size = m_Width.GetTotalWidth() * 2u + 2u;
        if (m_Src_Buffer.CheckSpace(read_size) == false)
        {
            return OutOfMem;
        }

        //open source file
        m_Src = fopen(p_src_file, "rb");
        if (m_Src == NULL)
        {
            return FileNotFound;
        }

        //open destination file
        m_Dest = fopen(p_dest_file, "wb");
        if (m_Dest == NULL)
        {
            Close();
            return FileOpenForWriteFailed;
        }

        //read start column number if set
        if (p_start_col != NULL)
        {
            const long requested = strtol(p_start_col, NULL, 10);
            if (requested > 0 &&
                static_cast<unsigned long>(requested) < m_Width.GetNumCols())
            {
                m_Start_Col = static_cast<unsigned int>(requested);
            }
        }

        ErrCode err = None;
        unsigned int row = 0;   //row counter
        char* const row_buf = m_Src_Buffer.Read();
        while (fgets(row_buf, static_cast<int>(read_size), m_Src) != NULL)
        {
            //a line that did not fit into the buffer would otherwise be
            //continued by the next fgets and show up as extra rows
            if (strchr(row_buf, '\n') == NULL)
            {
                SkipRestOfLine();
            }

            row++;
            err = ProcessRow();
            if (err != None)
            {
                break;
            }
            Progress(row);
        }

        //make sure everything really reached the destination file
        if (err == None && fflush(m_Dest) != 0)
        {
            err = FileWriteFailed;
        }

        //and close files
        Close();
        return err;
    }
};

//class derived from CTextToCSV to allow
//bespoke progress handling
class MyProcess : public CTextToCSV
{
protected:
    //print the row number every 10000 rows
    virtual void Progress(unsigned int row_num)
    {
        if ((row_num % 10000) == 0)
        {
            printf("Row %u\r\n", row_num);
        }
    }
};

//entry point: texttocsv <dest> <source> <widths> [start column]
//returns 0 on success (or when only the usage text was shown),
//-1 if a parameter is missing or the conversion failed
int main(int argc, char* argv[])
{
    printf("TextToCSV Version 1.0.0.1 (c) 2014\r\n\r\n");

    //read params
    const int num_param = argc;
    if (num_param < 4)
    {
        printf("parameters:-\r\n\r\n");
        printf("dest filename (e.g. mydata.csv)\r\n");
        printf("source filename (e.g. mydata.txt\r\n");
        printf("column widths (e.g. 10,10,20,30,50\r\n");
        printf("start column position (optional, 0 based)\r\n");
        return(0);
    }

    const char* src = NULL;
    const char* dest = NULL;
    const char* widths = NULL;
    const char* start_col = NULL;

    //assign each parameter in order: dest, src, widths, start column
    for (int i = 1; i < num_param; i++)
    {
        const char* pr = argv[i];
        if (pr == NULL)
        {
            continue;
        }
        if (dest == NULL)
        {
            dest = pr;
        }
        else if (src == NULL)
        {
            src = pr;
        }
        else if (widths == NULL)
        {
            widths = pr;
        }
        else if (start_col == NULL)
        {
            start_col = pr;
        }
    }

    if (src == NULL)
    {
        printf("Missing source filename");
        return -1;
    }
    if (dest == NULL)
    {
        printf("Missing dest filename");
        return -1;
    }
    if (widths == NULL)
    {
        printf("Missing column widths");
        return -1;
    }

    printf("Processing file %s into file %s\r\n\r\n", src, dest);

    //an instance of our class, derived from CTextToCSV
    //note that this instance is created on the stack which is simpler and
    //safer than using new and delete.
    MyProcess converter;

    //and process the file
    const MyProcess::ErrCode err = converter.Process(dest, src, widths, start_col);

    //read error code and print a report to console
    switch (err)
    {
    case MyProcess::None:
        printf("process completed, no errors\r\n");
        return 0;
    case MyProcess::OutOfMem:
        printf("Error: out of memory\r\n\r\n");
        break;
    case MyProcess::FileNotFound:
        printf("Error: source file not found\r\n\r\n");
        break;
    case MyProcess::FileOpenForWriteFailed:
        printf("Error: unable to open destination file\r\n\r\n");
        break;
    case MyProcess::FileWriteFailed:
        printf("Error: failed writing to destination file\r\n\r\n");
        break;
    default:
        break;
    }

    //let the caller (e.g. a batch script) detect the failure
    return -1;
}
Please refresh the page and try again.