I have implemented the Naive Bayes algorithm on a large data set of 410k rows. All of my records are getting classified correctly, but the program takes almost an hour to write the records into the corresponding files. What is the best way to improve the performance of my code? The code below writes the 410k records into the corresponding files. Thank you.
fp=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp)!=NULL) // Read each training record and count the class labels.
{
token = strtok(line,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token1 = strtok(token,"\n");
memcpy(mystr,&token1[0],strlen(token1)-1);
mystr[strlen(token1)-1] = '\0';
if( strcmp(mystr,"ok") == 0 )
counter_ok++;
else
counter_fraud++;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
prblty_ok = counter_ok/(counter_ok+counter_fraud);
prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fclose(fp);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","a");
fp3=fopen("sales_unknown_fraud_classified.txt","a");
while(fgets(line1,80,fp)!=NULL) // Read each unknown record so it can be classified.
{
unknwn_attr1 = strtok(line1,",");
unknwn_attr2 = strtok(NULL,",");
unknwn_attr3 = strtok(NULL,",");
unknwn_attr4 = strtok(NULL,",");
unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
fp1=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp1)!=NULL) // Re-read the entire training file for every unknown record.
{
ok_fraud_attr1 = strtok(line,",");
ok_fraud_attr2 = strtok(NULL,",");
ok_fraud_attr3 = strtok(NULL,",");
ok_fraud_attr4 = strtok(NULL,",");
ok_fraud_attr5 = strtok(NULL,",");
ok_fraud_attr6 = strtok(NULL,",");
memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
//ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
//printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));
if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_ok_attr5++;
}
if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_fraud_attr5++;
}
}
fclose(fp1);
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if(counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
fclose(fp);
fclose(fp2);
fclose(fp3);
There are several things I can see right away that you can do, in the order I would try them:
- Stop the repeated open-write-close, open-write-close cycle on your output files. Their names are fixed and there are only a few of them. Open them all once at the start, then flush and close them when you're done.
- There are several logic constructs that can be significantly simplified.
- Your strlen() rampage needs to be significantly reduced. Most decent optimizing compilers will detect that the source string is unchanged and optimize out subsequent calls on a known-unchanged char pointer, so I would do this last, but honestly I'd still do it, because repeated strlen() calls on the same data are bad practice (see the sketch just after this list).
- ADDED AFTER CONVERSING WITH OP: You repeatedly re-parse the same data file (sales_ok_fraud.txt) over and over, once per line of data in sales_unknwn.txt. 12GB / avg-line-length is a LOT of unneeded repetitious parsing if sales_ok_fraud.txt can fit in memory. Load that data once, calculate its base stats once, and use the data and stats from then on for the rest of your data crunch.
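As a quick illustration of the strlen() point, here is a minimal sketch using the variable names from your posted loop (attr6_len is a new local introduced here; your declarations may differ): call strlen() once, keep the result, and reuse it.
size_t attr6_len = strlen(ok_fraud_attr6); // one call instead of two
memcpy(ok_fraud_attr6_str, ok_fraud_attr6, attr6_len - 2);
ok_fraud_attr6_str[attr6_len - 2] = '\0';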
Logic Reductions
You can cut out a ton of work in one place in particular, changing this:
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr5++;
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr5++;
To this:
if (strcmp(ok_fraud_attr6_str, "ok") == 0)
{
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
counter_ok_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
counter_ok_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
counter_ok_attr5++;
}
else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)
{
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
counter_fraud_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)
counter_fraud_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
counter_fraud_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
counter_fraud_attr5++;
}
Front-Loading sales_ok_fraud.txt
The following relies on the sanctity of the data format of your sales_ok_fraud.txt stats file, while trying to be as pedantic as possible about validating that format. It allocates a chunk of memory large enough to hold the entire file plus one char, so the whole body can be treated as a single null-terminated string. That buffer is then carved up using the same general algorithm you had before. The result is a table of pointers to fixed-length char-pointer arrays that can be used iteratively in the place where you currently (and repeatedly) open, parse, use, and throw away all that content.
// headers needed by the loader below
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// OFAttribs is an array of six string pointers: one parsed record
typedef char *OFAttribs[6];
// loads a table consisting of the following format:
//
// str1,str2,str3,str4,str5,str6\n
// str1,str2,str3,str4,str5,str6\n
// ...
// str1,str2,str3,str4,str5,str6
//
// any deviation from the above will cause premature termination of the loop
// but will return whatever was able to be parsed up to the point of failure.
// the caller should therefore always `free()` the resulting table and data
// pointers.
size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
if (!fname || !*fname)
return 0;
// check file open for thumbs up
FILE *fp = fopen(fname, "rb");
if (!fp)
return 0;
// determine the file length so the whole thing can be slurped at once
fseek(fp, 0, SEEK_END);
long len = ftell(fp);
fseek(fp, 0, SEEK_SET);
// allocate enough ram for the entire file plus terminator
OFAttribs *pTable = NULL;
size_t nTableLen = 0;
char *pTableData = malloc(len + 1);
if (NULL == pTableData)
{
// allocation failure; bail out before touching the buffer
fclose(fp);
return 0;
}
fread(pTableData, len, 1, fp);
pTableData[len] = 0;
// no longer need the file
fclose(fp);
// prime first token
char *token = strtok(pTableData, ",");
while (token)
{
// read next line of tokens
OFAttribs attribs = { NULL };
for (int i=0;i<4 && token; ++i)
{
attribs[i] = token;
token = strtok(NULL, ",");
}
// filled 0..3; grab the next-to-last token and move on
if (attribs[3] && token)
{
// next-to-last entry set
attribs[4] = token;
// the last entry is terminated only by the newline
token = strtok(NULL, "\n");
if (token)
{
// proper format. 6 parms, 5 commas, one new-line.
attribs[5] = token;
// trim trailing whitespace (e.g. '\r' left over from CRLF line endings)
size_t slen = strlen(token);
while (slen > 0 && isspace((unsigned char)token[slen-1]))
token[--slen] = 0;
// make space on the master list for another.
OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1));
if (NULL != tmp)
{
pTable = tmp;
memcpy(pTable + nTableLen++, attribs, sizeof(attribs));
}
else
{ // allocation failure.
printf("Error allocating memory for expanding OKFraud data set");
exit(EXIT_FAILURE);
}
}
else
{ // not good.
printf("Invalid line format detected. Expected ok/fraud\\n");
break;
}
// prime the first token of the next line
token = strtok(NULL, ",");
}
}
// set output variables
*ppTable = pTable;
*ppTableData = pTableData;
return nTableLen;
}
Putting It Together
Incorporating everything above has the following effect on your code base:
// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);
// walk table to determine probabilities of ok and fraud states.
// note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
if (0 == strcmp("ok", okfr[i][5]))
++counter_ok;
else
++counter_fraud;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
// compute probabilities for the ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) // Read each unknown record and classify it.
{
char *unknwn_attr1 = strtok(line1,",");
char *unknwn_attr2 = strtok(NULL,",");
char *unknwn_attr3 = strtok(NULL,",");
char *unknwn_attr4 = strtok(NULL,",");
char *unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
for (size_t i=0;i<okfr_len; ++i)
{
if( strcmp(okfr[i][5], "ok") == 0 )
{
// ok case
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_ok_attr5++;
}
else // fraud case
{
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_fraud_attr5++;
}
}
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if (counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);
fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;
These are just a few ideas. There are more things in there, to be sure, but these should help enormously: they turn the job into a single forward scan of your input with continuous output, which is about as efficient as you're going to get under these circumstances. Without question, the combination of the big three (a single open and close per file, the logic reductions, and parsing and caching sales_ok_fraud.txt once) will have a huge impact on performance, especially the first and last of these.
EDIT: Assisted the OP in updating this processor to front-load the sales_ok_fraud.txt file content, thereby eliminating the repeated loading, parsing, and discarding of some 15000+ lines of text (once per main-source input line). Answer above updated accordingly.
@m02ph3u5 is right. Keep your files open, take the calls to fopen and fclose out of the loops.
inputFile = fopen("sales_unknwn.txt","r");
okayFile = fopen("sales_ok_fraud.txt","r");
unknownOkayFile = fopen("sales_unknown_ok_classified.txt","a");
unknownFraudFile = fopen("sales_unknown_fraud_classified.txt","a");
// your loops go here
fclose(inputFile);
fclose(okayFile);
fclose(unknownOkayFile);
fclose(unknownFraudFile);
If it's still slow, run a sampling profiler on your app with a subset of the test data as input, just to keep the turnaround fast. This will tell you where the program is spending its time; you might be surprised. If you don't have a profiler, you can do a poor man's simulation of a sampling profiler by running your app under a debugger, breaking into the debugger repeatedly, and noting which function it is in each time. If it's in a certain function or on a certain line most of the time, that's probably a hotspot you can optimize.
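For example, if gcc and gprof are available (assuming a Linux-style toolchain; the source and binary names below are placeholders, not your actual files), a minimal profiling run looks like this:
# build with profiling instrumentation enabled
gcc -pg -O2 -o classifier classifier.c
# run against a small sample of the data; this writes gmon.out
./classifier
# summarize where the time went
gprof ./classifier gmon.out > profile.txt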
A few suggestions:
• Repeatedly opening files for append, closing them, and reopening them is very expensive. I/O is much slower than in-memory access, and you're forcing the OS to reopen each file and seek to its end every time you write to it. It's better to open them once at the beginning and close them at the end, unless you're concerned the program will crash and you'll lose the data you've written so far.
• You can replace the lines
memcpy(ok_fraud_attr6_str, &ok_fraud_attr6[0], strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
with
ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
and then use ok_fraud_attr6 in your tests. Since strtok is destructive (a quick search will be worth your time to learn why it's often a bad idea to use it at all), you needn't worry about preserving the contents of line or ok_fraud_attr6.
• When you find yourself writing the same code over and over again, it's often a clue that your algorithm is inefficient. Instead of
if ((some_unique_test) && (a_common_test))
do_some_stuff;
if ((some_other_unique_test) && (a_common_test))
do_other_stuff;
you can write
if (a_common_test) {
if (some_unique_test)
do_some_stuff;
if (some_other_unique_test)
do_other_stuff;
}
Note, however, that only the first suggestion is likely to have a noticeable effect on the program's execution time, though they're all good habits to learn.
Jason's recommendation to use a profiler is very good advice and can't be emphasized enough. Programmers, even experienced ones, are notoriously awful at predicting where the bottlenecks in their code are. Next to a debugger, a profiler is your best friend.