How to improve the performance while operating wit

2019-05-23 03:57发布

站内文章 / C

45 0

太酷不给撩

女 | 书童

私信

可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

I have implemented Naive Bayes algorithm on a large data set of 410k rows.Now all my records are getting classified correctly but the thing is the program is taking almost an hr to write the records into the corresponding files.What is the best way to improve performance of my code.Here is the below code.This piece of code is writing the 410k records into the corresponding files.Thank you.

    fp=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp)!=NULL) //Reading each line from file to calculate the file size.
        {
                token = strtok(line,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token1 = strtok(token,"\n");
                memcpy(mystr,&token1[0],strlen(token1)-1);
                mystr[strlen(token1)-1] = '\0';

        if( strcmp(mystr,"ok") == 0 )
            counter_ok++;
        else 
        counter_fraud++;       
    }
    printf("The no. of records with OK label are %f\n",counter_ok);
    printf("The no. of records with FRAUD label are %f\n",counter_fraud);

    prblty_ok = counter_ok/(counter_ok+counter_fraud);
    prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
    printf("The probability of OK records is %f\n",prblty_ok);
    printf("The probability of FRAUD records is %f\n",prblty_fraud);
    fclose(fp);

    fp=fopen("sales_unknwn.txt","r");
    fp2=fopen("sales_unknown_ok_classified.txt","a");
    fp3=fopen("sales_unknown_fraud_classified.txt","a");
    while(fgets(line1,80,fp)!=NULL) //Reading each line from file to calculate the file size.
        {
                unknwn_attr1 = strtok(line1,",");
                unknwn_attr2 = strtok(NULL,",");
                unknwn_attr3 = strtok(NULL,",");
                unknwn_attr4 = strtok(NULL,",");
                unknwn_attr5 = strtok(NULL,",");

        //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

        fp1=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp1)!=NULL) //Reading each line from file to calculate the file size.
            {
            ok_fraud_attr1 = strtok(line,",");
                    ok_fraud_attr2 = strtok(NULL,",");
                    ok_fraud_attr3 = strtok(NULL,",");
                    ok_fraud_attr4 = strtok(NULL,",");
                    ok_fraud_attr5 = strtok(NULL,",");
            ok_fraud_attr6 = strtok(NULL,",");
                    memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
                    ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
            //ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';      
            //printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str)); 
            if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
            {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_ok_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_ok_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_ok_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_ok_attr5++;
            }

                    if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
                        {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_fraud_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_fraud_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_fraud_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_fraud_attr5++;
            }
        }
        fclose(fp1);
        if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

        if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

        if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

        if(counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

        if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

        if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

        if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

        if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

        total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
        total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


//      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
//      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
//      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
//      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
//      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

        if(total_prblty_ok > total_prblty_fraud)
        {
            fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }   
        else
        {
            fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }
        counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
        counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
    }

    fclose(fp);
        fclose(fp2);    
        fclose(fp3);

回答1:

There are a several things I can see right away you can do, in the order I would try them:

Stop with the repeated open-write-close, open-write-close ideology on your output files. Their names are fixed and limited. Open them all appropriately at the start of this thing, then flush-and-close when you're done.
There are several logic-constructs that can be significantly simplified.
Your strlen() rampage needs to significantly be reduced. Most decent optimizing compilers will detect the unchanged source and optimize out the subsequent calls on a known-unchanged char-ptr, so I would do this last (but honestly I'd still do it as its a bad practice to call repeated strlen() invokes on the same data.
ADDED AFTER CONVERSING WITH OP :You repeatedly re-parse the same data file (sales_ok_fraud.txt) over and over, once for line of data in sales_unknwn.txt. 12gB/abg-line-length is a LOT of unneeded repetitious parsing if sales_ok_fraud.txt can fit in memory. Load that data once calculate its base stats once, and use the data and stats therein for the rest of your data crunch.

Logic Reductions

You can cut out a ton of work in one place in particular, changing this:

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr5++;

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr5++;

To this:

    if (strcmp(ok_fraud_attr6_str, "ok") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_ok_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
            counter_ok_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_ok_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_ok_attr5++;
    }
    else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_fraud_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)
            counter_fraud_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_fraud_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_fraud_attr5++;
    }

Front-Loading sales_ok_fraud.txt

The following relies on the sanctity of the data format of your sales_ok_fraud.txt stats file, while trying to be as pedantic as possible in validating said format. It allocates a chunk of memory large enough to hold the entire file plus-one-char to treat the entire body as a single null-term-string. That buffer is then pieced-up via the same general algorithm you had prior. The result will be a table of pointers to fixed-length char-pointer arrays that can then be used iteratively in the same place you currently (and repeatedly) open, parse, use, and throw away all that content.

// declare an array of six string pointers
typedef char *OFAttribs[6];

// loads a table consisting of the following format:
//
// str1,str2,str3,str4,str5,str6\n
// str1,str2,str3,str4,str5,str6\n
// ...
// str1,str2,str3,str4,str5,str6
//
// any deviation from the above will cause premature termination of the loop
//  but will return whatever was able to be parsed up to the point of failure.
//  the caller should therefore always `free()` the resulting table and data
//  pointers.

size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
    if (!fname || !*fname)
        return 0;

    // check file open for thumbs up
    FILE *fp = fopen(fname, "rb");
    if (!fp)
        return 0;

    // allocate enough memory to hold the entire file, plus a terminator
    fseek(fp, 0,  SEEK_END);
    long len = ftell(fp);
    fseek(fp, 0,  SEEK_SET);

    // allocate enough ram for the entire file plus terminator
    OFAttribs *pTable = NULL;
    size_t nTableLen = 0;
    char *pTableData =  malloc((len+1) * sizeof(char));
    if (NULL != pTableData)
    {
        fread(pTableData , len, 1, fp);
        pTableData[len] = 0;
    }

    // no longer need the file
    fclose(fp);

    // prime first token
    char *token = strtok(pTableData, ",");
    while (token)
    {
        // read next line of tokens
        OFAttribs attribs = { NULL };
        for (int i=0;i<4 && token; ++i)
        {
            attribs[i] = token;
            token = strtok(NULL, ",");
        }

        // filled 0..3, set lat token and move on
        if (attribs[3] && token)
        {
            // next-to-last entry set
            attribs[4] = token;

            // line enter is only terminated by newline
            token = strtok(NULL, "\n");
            if (token)
            {
                // proper format. 6 parms, 5 commas, one new-line.
                attribs[5] = token;
                size_t slen = strlen(token);
                if (slen > 0)
                {
                    while (isspace(token[--slen]))
                        token[slen] = 0;
                }

                // make space on the master list for another.
                OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1));
                if (NULL != tmp)
                {
                    pTable = tmp;
                    memcpy(pTable + nTableLen++, attribs, sizeof(attribs));
                }
                else
                {   // allocation failure.
                    printf("Error allocating memory for expanding OKFraud data set");
                    exit(EXIT_FAILURE);
                }
            }
            else
            {   // not good.
                printf("Invalid line format detected. Expected ok/fraud\\n");
                break;
            }

            // next token of new line
            token = strtok(NULL, ",");
        }
    }

    // set output variables
    *ppTable = pTable;
    *ppTableData = pTableData;
    return nTableLen;
}

Putting It Together

Incorporating everything above has the following effect on your code base:

// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);

// walk table to determine probabilities of ok and fraud states.
//  note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
    if (0 == strcmp("ok", okfr[i][5]))
        ++counter_ok;
    else
        ++counter_fraud;
}

printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);

// compute probabilites for ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);

fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) //Reading each line from file to calculate the file size.
{
    char *unknwn_attr1 = strtok(line1,",");
    char *unknwn_attr2 = strtok(NULL,",");
    char *unknwn_attr3 = strtok(NULL,",");
    char *unknwn_attr4 = strtok(NULL,",");
    char *unknwn_attr5 = strtok(NULL,",");

    //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

    for (size_t i=0;i<okfr_len; ++i)
    {

        if( strcmp(okfr[i][5], "ok") == 0 )
        {
            // ok case
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_ok_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_ok_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_ok_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_ok_attr5++;
        }

        else // fraud case
        {
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_fraud_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_fraud_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_fraud_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_fraud_attr5++;
        }
    }

    if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

    if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

    if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

    if (counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

    if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

    if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

    if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

    if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

    total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
    total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


    //      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
    //      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
    //      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
    //      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
    //      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

    if(total_prblty_ok > total_prblty_fraud)
    {
        fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    else
    {
        fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
    counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}

// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);

fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;

These are just a few ideas. There are more things in there to-be-sure, but these should help enormously in processing your file single-forward-scan with continuous output, which is about as efficient as you're going to get under these circumstances. Without question the combination of the big three: single file open+close, logic reductions and single-parse-cache'ing the sales_ok_fraud.txt file will have a huge improvement in performance, especially the first and last of these.

EDIT Assisted the OP in updating this processor to front-load the sales_ok_fraud.txt file content, thereby eliminating repeated loading, parsing, and promptly throwing out some 15000+ line of text to be parsed repeatedly (once per main-source input line). Answer above updated accordingly.

回答2:

@m02ph3u5 is right. Keep your files open, take the calls to fopen and fclose out of the loops.

inputFile = fopen("sales_unknwn.txt","r");
okayFile = fopen("sales_ok_fraud.txt","r");
unknownOkayFile = fopen("sales_unknown_ok_classified.txt","a");
unknownFraudFile = fopen("sales_unknown_fraud_classified.txt","a");

// your loops go here

fclose(inputFile);
fclose(okayFile);
fclose(unknownOkayFile);
fclose(unknownFraudFile);

If it's still slow, run a sampling profiler on your app with a subset of the test data as input just to keep the turnaround fast. This will tell you where the program is spending it's time. You might be surprised. If you don't know have a profiler to use, you can do a poor man's simulation of a sampling profiler by running your app with a debugger, breaking into the debugger repeatedly and noting which function it is running in. If you see it in a certain function or on a certain line most of the time, that's probably a hotspot that you can optimize.

回答3:

A few suggestions:

• Repeatedly opening files for append, closing them, and reopening them is very expensive. This is because I/O is much slower than in-memory access, and you're forcing the disk to open each file and seek to the end every time you write to it. Better to open them once at the beginning and close them at the end, unless you're concerned the program will crash and you'll lose the data you've written so far.

• You can replace the lines

memcpy(ok_fraud_attr6_str, &ok_fraud_attr6[0], strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';

with

ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';

and then use ok_fraud_attr6 in your tests. Since strtok is destructive (a quick search will be worth your time to learn why it's often a bad idea to use it at all), you needn't worry about preserving the contents of line or ok_fraud_attr6.

• When you find yourself writing the same code over and over again, it's often a clue that your algorithm is inefficient. Instead of

if ((some_unique_test) && (a_common_test))
  do_some_stuff;
if ((some_other_unique_test) && (a_common_test))
  do_other_stuff;

you can write

if (a_common_test) {
  if (some_unique_test)
    do_some_stuff;
  if (some_other_unique_test)
    do_other_stuff;
}

Note, however, that only the first suggestion is likely to have a noticeable effect on the program's execution time, though they're all good habits to learn.

Jason's recommendation to use a profiler is very good advice and can't be emphasized enough. Programmers--even experienced ones--are notoriously awful at predicting where the bottlenecks in their code are. Next to a debugger, a profile is your best friend.