A recent data acquisition brought forth the requirement to process the fixed-width text files that comprise the data. This would not have been much of a discussion point were it not for the fact that some of the files were huge - 60GB in one case. Most of the content of these large files is the space character, serving as padding for the fixed-width fields; this serves to illustrate how inefficient fixed-width text files are, but that is not the point we're making here today.
Given the following fixed-width text file (test.txt), with column widths of 20 and 50:

Jupiter             A planet
Andromeda           A "nearby" galaxy
Sirius              A star
Eros                An asteroid, a rocky body
Titan               A moon

...we used texttocsv.exe to create the following CSV (test.csv):-
Jupiter,A planet
Andromeda,"A ""nearby"" galaxy"
Sirius,A star
Eros,"An asteroid, a rocky body"
Titan,A moon

Note that the "Andromeda" row has the quotes around "nearby" correctly escaped.
texttocsv.exe test.csv test.txt 20,50
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <new>
//class to hold the array of column widths
//
//The widths are supplied as a comma separated list of decimal numbers
//(e.g. "10,10,20"). The class exposes each width, the number of columns
//and the sum of all widths.
class CWidthList
{
private:
    unsigned int* m_Column_Width;   //one entry per column, NULL until parsed
    unsigned int m_Num_Cols;        //number of entries in m_Column_Width
    unsigned int m_Total_Width;     //sum of all column widths

    //prevent new from being used - force
    //any instance to be on the stack
    void * operator new (size_t);
    void * operator new[] (size_t);

    //the instance owns m_Column_Width, so copying is not supported
    //(declared but deliberately not defined)
    CWidthList(const CWidthList&);
    CWidthList& operator=(const CWidthList&);

public:
    //constructor - start with an empty list so that the getters are
    //safe to call before ParseWidths
    CWidthList()
    {
        this->m_Column_Width=NULL;
        this->m_Num_Cols=0;
        this->m_Total_Width=0;
    }

    //destructor - free the width array
    ~CWidthList()
    {
        delete[] this->m_Column_Width;
        this->m_Column_Width=NULL;
    }

    //return width of column number (zero based)
    //returns 0 if column number was invalid
    unsigned int GetWidth(unsigned int column_number) const
    {
        if(column_number<this->m_Num_Cols)
        {
            return(this->m_Column_Width[column_number]);
        }
        return 0;
    }

    //return total width of columns
    unsigned int GetTotalWidth(void) const
    {
        return(this->m_Total_Width);
    }

    //return the number of columns
    unsigned int GetNumCols(void) const
    {
        return(this->m_Num_Cols);
    }

    //parse the widths string
    //
    //p_widths: comma separated decimal widths; a NULL pointer is treated
    //          as an empty string. Characters other than digits and commas
    //          are ignored, and an empty entry yields a width of 0.
    //returns false if out of memory, in which case any previously
    //parsed widths are left untouched
    bool ParseWidths(const char* p_widths)
    {
        if(p_widths==NULL)
        {
            p_widths="";
        }

        //there is one more column than there are commas
        unsigned int num_cols=1;
        for(const char* p=p_widths; *p!=0; p++)
        {
            if(*p==',')
            {
                num_cols++;
            }
        }

        //create the buffer for the column widths
        unsigned int* widths=NULL;
        try
        {
            widths=new unsigned int[num_cols];
        }
        catch (const std::bad_alloc&)
        {
            //out of memory for the buffer
            return false;
        }

        //accumulate digits directly rather than via a fixed-size
        //scratch buffer, so a long run of digits cannot overflow anything
        //(an absurdly large number simply wraps as unsigned arithmetic)
        unsigned int total=0;
        unsigned int cur_col=0;
        unsigned int w=0;
        for(const char* p=p_widths; ; p++)
        {
            const char c=*p;
            if(c>='0' && c<='9')
            {
                w=(w*10)+static_cast<unsigned int>(c-'0');
            }
            else if(c==',' || c==0)
            {
                //end of an entry - the terminator closes the last one
                widths[cur_col]=w;
                total+=w;
                cur_col++;
                w=0;
                if(c==0)
                {
                    break;
                }
            }
        }

        //replace any previously parsed list
        delete[] this->m_Column_Width;
        this->m_Column_Width=widths;
        this->m_Num_Cols=num_cols;
        this->m_Total_Width=total;
        return true;
    }
};
//simple 8 bit character buffer class
//
//Owns a heap allocated char array that grows on demand and tracks a
//current write position. The array always has room for one more char
//than the recorded capacity.
class CCharBuffer
{
private:
    char* m_Data;               //allocated array, NULL if allocation failed
    unsigned int m_Max_Chars;   //maximum number of
                                //chars allowed
    unsigned int m_Pos;         //current write pos

    //prevent new from being used -
    //force any instance to be on the stack
    void * operator new (size_t);
    void * operator new[] (size_t);

    //the instance owns m_Data, so copying is not supported
    //(declared but deliberately not defined)
    CCharBuffer(const CCharBuffer&);
    CCharBuffer& operator=(const CCharBuffer&);

public:
    //constructor
    CCharBuffer()
    {
        this->m_Pos=0;
        try
        {
            //allocate num chars plus an extra
            this->m_Max_Chars = 32;
            this->m_Data=new char[this->m_Max_Chars+1];
        }
        catch (const std::bad_alloc&)
        {
            //out of memory when creating the buffer
            //so mark the buffer as not created
            this->m_Data=NULL;
            this->m_Max_Chars = 0;
        }
    }

    //destructor
    ~CCharBuffer()
    {
        //free the allocated buffer (it was created with new[])
        delete[] this->m_Data;
        this->m_Data=NULL;
    }

    //ensure the buffer can hold at least num_chars characters,
    //preserving the characters written so far
    //return false if failed to allocate a buffer
    bool CheckSpace(const unsigned int num_chars)
    {
        if(this->m_Data!=NULL && num_chars<=this->m_Max_Chars)
        {
            return true;
        }

        //grow to at least double the current capacity so that adding
        //one character at a time does not reallocate on every call;
        //if the doubling wraps it is smaller than num_chars and ignored
        unsigned int new_max=num_chars;
        const unsigned int doubled=this->m_Max_Chars*2;
        if(doubled>new_max)
        {
            new_max=doubled;
        }
        if(new_max+1==0)
        {
            //the extra char would wrap the allocation size to zero
            return false;
        }

        char* new_dest = NULL;
        try
        {
            //allocate num chars plus an extra
            new_dest = new char[new_max+1];
        }
        catch (const std::bad_alloc&)
        {
            //out of memory when resizing destination buffer
            return false;
        }
        if(this->m_Pos>0 && this->m_Data!=NULL)
        {
            //copy the existing data into the new buffer
            memcpy(new_dest,this->m_Data,this->m_Pos);
        }
        delete[] this->m_Data;   //delete the OLD buffer (if it existed)
        this->m_Data=new_dest;   //and use the new one
        this->m_Max_Chars=new_max;
        return true;
    }

    //specify the current position
    //a position beyond the capacity is clamped to the last valid index
    void SetPos(const unsigned int val)
    {
        if(this->m_Max_Chars==0)
        {
            //no buffer - the only sensible position is the start
            this->m_Pos=0;
        }
        else if(val>=this->m_Max_Chars)
        {
            this->m_Pos=this->m_Max_Chars-1;
        }
        else
        {
            this->m_Pos=val;
        }
    }

    //return the current write position
    unsigned int GetPos(void) const
    {
        return this->m_Pos;
    }

    //make space and add a character
    //returns false if failed to make space
    bool Add(const char c)
    {
        if(this->m_Pos+1<this->m_Pos)
        {
            //position would wrap
            return false;
        }
        if(CheckSpace(this->m_Pos+1)==false)
        {
            return false;
        }
        this->m_Data[this->m_Pos]=c;
        this->m_Pos++;
        return true;
    }

    //make space and add a string of num_chars characters
    //returns false if failed to make space
    bool Add(const char* src,const unsigned int num_chars)
    {
        if(this->m_Pos+num_chars<this->m_Pos)
        {
            //position would wrap
            return false;
        }
        if(CheckSpace(this->m_Pos+num_chars)==false)
        {
            return false;
        }
        if(num_chars>0)
        {
            memcpy(this->m_Data+this->m_Pos,src,num_chars);
            this->m_Pos+=num_chars;
        }
        return true;
    }

    //read pointer to the buffer - only valid while the
    //instance is in scope and until the buffer next grows
    char* Read(void)
    {
        return this->m_Data;
    }
};
//class to process a fixed width text file into a CSV
class CTextToCSV
{
public:
//error codes
enum ErrCode
{
None = 0,
OutOfMem,
FileNotFound,
FileOpenForWriteFailed
};
private:
//members
FILE* m_Dest; //destination (CSV) file
FILE* m_Src; //source text file
CCharBuffer m_Src_Buffer;
CCharBuffer m_Dest_Buffer;
CWidthList m_Width;
unsigned int m_Start_Col; //start column (optional)
protected:
//prevent new from being used - force any
//instance to be on the stack
void * operator new (size_t);
void * operator new[] (size_t);
//private nethods
private:
//read field into m_Dest_Buffer
//returns OutOfMem if failed to resize dest buffer
//
ErrCode ReadField(const int curpos,const int width)
{
//first, scan src to get the trimmed extents
//and to discover if comma is present
int start=curpos;
int end=curpos+width;
//read
const char* src_buf=this->m_Src_Buffer.Read();
while(startstart)
{
if(src_buf[end]!=0x20)
{
//non space found
break;
}
end--;
}
//start and end are inclusive
bool enclose_in_commas = false;
int i=start;
while(im_Dest_Buffer.Add(
src_buf+start,bytes_to_copy)==false)
{
//insufficient space in the destination buffer
return OutOfMem;
}
}
else
{
//enclose in quotes and escape any double quote character
//add opening quotes
if(this->m_Dest_Buffer.Add('"')==false)
{
//insufficient space in the destination buffer
return OutOfMem;
}
//copy all characters and escape any double quote
while(startm_Dest_Buffer.Add('"');
this->m_Dest_Buffer.Add('"');
}
else
{
//simply add the character
if(this->m_Dest_Buffer.Add(src_buf[start])==false)
{
//out of memory
return OutOfMem;
}
}
start++;
}
//add closing quotes
if(this->m_Dest_Buffer.Add('"')==false)
{
return OutOfMem;
}
}
return None;
}
//process each row
//returns ErrCode (normally None)
ErrCode ProcessRow(void)
{
this->m_Dest_Buffer.SetPos(0);
//if a CR is found, terminate the src buffer before it
char* src_buf=this->m_Src_Buffer.Read();
char* sp=strstr(src_buf,"\r");
if(sp!=NULL)
{
sp[0]=0;
}
//pad the src buffer with spaces
int len=strlen(src_buf);
int pad_len=this->m_Width.GetTotalWidth()-len;
if(pad_len>0)
{
//pad with spaces
sp=src_buf + len;
memset(sp,0x20,pad_len);
}
//read each field
unsigned int x=0;
int curpos=0;
if(this->m_Start_Col>0 &&
this->m_Start_Colm_Width.GetNumCols())
{
//specified start column is valid
while(x < this->m_Start_Col)
{
curpos += this->m_Width.GetWidth(x);
x++;
}
}
while(x < this->m_Width.GetNumCols())
{
if(ReadField(curpos,
this->m_Width.GetWidth(x))==OutOfMem)
{
//failed to read a field due to memory failure
return OutOfMem;
}
x++;
//add a comma UNLESS this is the last field
if(xm_Width.GetNumCols())
{
if(this->m_Dest_Buffer.Add(',')==false)
{
//insufficient space in the destination buffer
return OutOfMem;
}
//add the width of previous column
curpos += this->m_Width.GetWidth(x-1);
}
}
fwrite(this->m_Dest_Buffer.Read(),
1,
this->m_Dest_Buffer.GetPos(),
this->m_Dest);
fwrite("\r\n",1,2,this->m_Dest);
return None;
}
//close files
void Close(void)
{
//close src file
if(this->m_Src!=NULL)
{
fclose(this->m_Src);
this->m_Src=NULL;
}
//close dest file
if(this->m_Dest!=NULL)
{
fclose(this->m_Dest);
this->m_Dest=NULL;
}
}
protected:
//process progress report as each row is read
virtual void Progress(unsigned int row_num)
{
}
public:
//constructor
CTextToCSV()
{
this->m_Dest = NULL;
this->m_Src = NULL;
}
//destructor
~CTextToCSV()
{
//close open files and free allocated buffers
Close();
}
ErrCode Process(const char* p_dest_file,
const char* p_src_file,
const char* p_widths,
const char* p_start_col)
{
this->m_Start_Col=0;
//parse the widths string
if(this->m_Width.ParseWidths(p_widths)==false)
{
return OutOfMem;
}
//ensure the src buffer has
//sufficient space to read total width
if(this->m_Src_Buffer.CheckSpace(
this->m_Width.GetTotalWidth()*2)==false)
//ensure src buffer min size
{
return OutOfMem;
}
//open source file
this->m_Src=fopen(p_src_file,"rb");
if(this->m_Src==NULL)
{
//failed to open src file
return FileNotFound;
}
//open destination file
this->m_Dest=fopen(p_dest_file,"wb");
if(this->m_Dest==NULL)
{
//failed to open dest file
return FileOpenForWriteFailed;
}
//read start column number if set
if(p_start_col!=NULL)
{
this->m_Start_Col=atoi(p_start_col);
}
unsigned int row=0; //row counter
while(1==1)
{
void* result=fgets(this->m_Src_Buffer.Read(),
this->m_Width.GetTotalWidth()*2,
this->m_Src);
if(result==NULL)
{
break;
}
row++;
ErrCode err = ProcessRow();
if(err!=None)
{
return err;
}
Progress(row);
}
//and close files and buffers
Close();
return None;
}
};
//class derived from CTextToCSV to allow
//bespoke progress handling
class MyProcess : public CTextToCSV
{
protected:
    //process progress report as each row is read
    //prints the row number to the console once every 10000 rows
    virtual void Progress(unsigned int row_num)
    {
        if((row_num%10000)==0)
        {
            //row_num is unsigned, so it must be printed with %u
            printf("Row %u\r\n",row_num);
        }
    }
};
int main(int argc, char* argv[])
{
printf("TextToCSV Version 1.0.0.1 (c) 2014\r\n\r\n");
//read params
int num_param=argc;
if(num_param {
printf("parameters:-\r\n\r\n");
printf("dest filename (e.g. mydata.csv)\r\n");
printf("source filename (e.g. mydata.txt\r\n");
printf("column widths (e.g. 10,10,20,30,50\r\n");
printf("start column position (optional, 0 based)\r\n");
return(0);
}
const char* src=NULL;
const char* dest=NULL;
const char* widths=NULL;
const char* start_col=NULL;
int i=1;
while(i {
const char* pr=(const char*)argv[i];
//assign each paramater
if(pr)
{
if(dest==NULL)
{
dest=pr;
}
else if(src==NULL)
{
src = pr;
}
else if(widths==NULL)
{
widths = pr;
}
else if(start_col==NULL)
{
start_col = pr;
}
}
i++;
}
if(src == NULL)
{
printf("Missing source filename");
return -1;
}
if(dest == NULL)
{
printf("Missing dest filename");
return -1;
}
if(widths == NULL)
{
printf("Missing column widths");
return -1;
}
printf("Processing file %s into file %s\r\n\r\n",src,dest);
//an instance of our class, derived from CTextToCSV
//note that this instance is created on the stack which is simpler and
//safer than using new and delete.
//
MyProcess curpos;
//and process the file
MyProcess::ErrCode err = curpos.Process(dest,src,widths,start_col);
//read error code if and print a report to console
if(err!=MyProcess::None)
{
switch(err)
{
case MyProcess::OutOfMem:
printf("Error: out of memory\r\n\r\n");
break;
case MyProcess::FileNotFound:
printf("Error: source file not found\r\n\r\n");
break;
case MyProcess::FileOpenForWriteFailed:
printf("Error: unable to open destination file\r\n\r\n");
break;
default:
break;
}
}
else
{
printf("process completed, no errors\r\n");
}
return 0;
}
Please refresh the page and try again.