Tej*_*eja 2 c machine-learning data-mining
我已经在一个约 410k 行的大型数据集上实现了朴素贝叶斯(Naive Bayes)算法。现在所有记录都能被正确分类,但问题是程序花了将近一个小时才把记录写入相应的文件。提高这段代码性能的最佳方法是什么?代码如下,它负责把这 410k 条记录写入相应的文件。谢谢。
// Pass 1: scan the labelled file once and count records labelled "ok"
// versus everything else (which is counted as fraud).
fp=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp)!=NULL) // read the labelled file line by line (fgets stores at most 79 characters per call)
{
// Step over the first five comma-separated fields; the sixth token is the label.
token = strtok(line,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
// Restart tokenising on the label itself to cut it at the newline.
token1 = strtok(token,"\n");
// Copy the label without its last character into mystr and terminate it.
// NOTE(review): presumably the dropped character is a trailing '\r' — confirm against the data file.
memcpy(mystr,&token1[0],strlen(token1)-1);
mystr[strlen(token1)-1] = '\0';
if( strcmp(mystr,"ok") == 0 )
counter_ok++;
else
counter_fraud++;
}
// NOTE(review): the counters are printed with %f, so they are presumably floating-point — confirm at their declarations.
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
// Class priors: each class count divided by the total number of labelled records.
prblty_ok = counter_ok/(counter_ok+counter_fraud);
prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fclose(fp);
// Pass 2: classify every line of the unlabelled file.
// Both result files are opened in append mode, so output is added after whatever they already contain.
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","a");
fp3=fopen("sales_unknown_fraud_classified.txt","a");
while(fgets(line1,80,fp)!=NULL) // one iteration per unlabelled record
{
// Split the unlabelled record into its first five comma-separated fields.
unknwn_attr1 = strtok(line1,",");
unknwn_attr2 = strtok(NULL,",");
unknwn_attr3 = strtok(NULL,",");
unknwn_attr4 = strtok(NULL,",");
unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
// The labelled file is re-opened, re-read and re-parsed in full for every
// unlabelled record; it is closed again after the inner loop.
fp1=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp1)!=NULL) // one iteration per labelled record
{
ok_fraud_attr1 = strtok(line,",");
ok_fraud_attr2 = strtok(NULL,",");
ok_fraud_attr3 = strtok(NULL,",");
ok_fraud_attr4 = strtok(NULL,",");
ok_fraud_attr5 = strtok(NULL,",");
ok_fraud_attr6 = strtok(NULL,",");
// Copy the label without its last two characters and terminate the copy.
// NOTE(review): presumably this strips a trailing "\r\n" — confirm against the data file.
memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
//ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
//printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));
// For an "ok" row, count each of attributes 2..5 that equals the unlabelled record's value.
if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_ok_attr5++;
}
// Same tallies for a "fraud" row; rows with any other label change no counter.
if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_fraud_attr5++;
}
}
fclose(fp1);
// Conditional probability of each attribute value given the class.
// A zero match count uses (count + arbitrary_value*prior)/(class_count + arbitrary_value)
// so the product below does not collapse to zero; otherwise the plain ratio
// count/class_count is used.
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if(counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
// Score of each class: its prior multiplied by the four conditional probabilities.
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
// Write the record to the "ok" file only when its ok score is strictly larger;
// ties and everything else go to the "fraud" file.
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
// Reset the per-record match counters before the next unlabelled record.
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
fclose(fp);
fclose(fp2);
fclose(fp3);
Run Code Online (Sandbox Code Playgroud)
下面按我会着手处理的先后顺序,列出几个一眼就能看出的问题:
减少 strlen() 调用
对 strlen() 的滥用需要大幅削减。大多数像样的优化编译器都能检测到源字符串未被修改,并把针对已知未变的 char 指针的后续调用优化掉,所以这一项我会放到最后再做(不过说实话,我仍然会去做,因为对同一份数据重复调用 strlen() 是一种不好的做法)。
逻辑缩减
只需改动一处就能省掉大量工作,把这段:
// Flat form: each of the eight tests pairs an attribute comparison with its
// own comparison of the label string, so the label may be compared again for
// every attribute that matches.
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr5++;
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr5++;
Run Code Online (Sandbox Code Playgroud)
改成这样:
// Decide the label first, so a training row costs at most two label
// comparisons before any attribute is examined.
if (0 == strcmp(ok_fraud_attr6_str, "ok"))
{
    // Row is labelled "ok": tally every attribute equal to the unknown record's.
    if (0 == strcmp(unknwn_attr2, ok_fraud_attr2)) { ++counter_ok_attr2; }
    if (0 == strcmp(unknwn_attr3, ok_fraud_attr3)) { ++counter_ok_attr3; }
    if (0 == strcmp(unknwn_attr4, ok_fraud_attr4)) { ++counter_ok_attr4; }
    if (0 == strcmp(unknwn_attr5, ok_fraud_attr5)) { ++counter_ok_attr5; }
}
else if (0 == strcmp(ok_fraud_attr6_str, "fraud"))
{
    // Row is labelled "fraud": same tallies, kept in the fraud counters.
    if (0 == strcmp(unknwn_attr2, ok_fraud_attr2)) { ++counter_fraud_attr2; }
    if (0 == strcmp(unknwn_attr3, ok_fraud_attr3)) { ++counter_fraud_attr3; }
    if (0 == strcmp(unknwn_attr4, ok_fraud_attr4)) { ++counter_fraud_attr4; }
    if (0 == strcmp(unknwn_attr5, ok_fraud_attr5)) { ++counter_fraud_attr5; }
}
// Rows carrying any other label leave every counter untouched.
Run Code Online (Sandbox Code Playgroud)
预加载 sales_ok_fraud.txt
下面的代码依赖于您的 sales_ok_fraud.txt 文件数据格式的完整无误,同时在验证该格式时尽量做到严格。
它分配一块足以容纳整个文件再加一个字符的内存,把整个内容当作一个以 null 结尾的字符串;然后用与您之前相同的通用算法切分该缓冲区。结果是一张表,表中每一项都是由固定数量的 char 指针组成的数组;之后就可以在您目前(反复)打开、解析、使用并丢弃全部内容的那个位置,直接遍历这张表。
// One parsed record: six string pointers (five attributes followed by the
// ok/fraud label). The pointers refer into the shared file buffer; they do
// not own any memory themselves.
typedef char *OFAttribs[6];

// Loads a whole file into memory and parses it as a table of the form:
//
//   str1,str2,str3,str4,str5,str6\n
//   str1,str2,str3,str4,str5,str6\n
//   ...
//   str1,str2,str3,str4,str5,str6
//
// fname        path of the file to load.
// ppTable      receives a malloc'd array of parsed records, or NULL when no
//              record was parsed.
// ppTableData  receives the malloc'd buffer holding the file text that the
//              record pointers refer to, or NULL when the file was not read.
//
// Returns the number of records stored in *ppTable.
//
// Both outputs are set to NULL before anything can fail, so the caller can
// (and should) always free() both of them, whatever the return value.
// Parsing stops at the first record that deviates from the format above;
// records parsed before that point are still returned. Trailing whitespace
// (for example the '\r' of a CRLF line ending) is stripped from the label.
// Failure to grow the table terminates the process with exit(EXIT_FAILURE).
size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
    if (!ppTable || !ppTableData)
        return 0;

    // Publish safe defaults first: every early return below then leaves the
    // caller with pointers that are valid arguments to free().
    *ppTable = NULL;
    *ppTableData = NULL;

    if (!fname || !*fname)
        return 0;

    FILE *fp = fopen(fname, "rb");
    if (!fp)
        return 0;

    // Measure the file so it can be read with a single call.
    long len = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        len = ftell(fp);
    if (len < 0 || fseek(fp, 0, SEEK_SET) != 0)
    {
        fclose(fp);
        return 0;
    }

    // Room for the entire file plus a terminator.
    char *pTableData = malloc((size_t)len + 1);
    if (NULL == pTableData)
    {
        fclose(fp);
        return 0;
    }

    // Terminate after the bytes actually read, so a short read never exposes
    // uninitialised memory to the tokenizer.
    size_t nRead = fread(pTableData, 1, (size_t)len, fp);
    pTableData[nRead] = 0;

    // no longer need the file
    fclose(fp);

    OFAttribs *pTable = NULL;
    size_t nTableLen = 0;   // records stored so far
    size_t nTableCap = 0;   // records the current allocation can hold

    // prime first token
    char *token = strtok(pTableData, ",");
    while (token)
    {
        // First four fields of the record; `token` ends up on the fifth.
        OFAttribs attribs = { NULL };
        for (int i = 0; i < 4 && token; ++i)
        {
            attribs[i] = token;
            token = strtok(NULL, ",");
        }

        // Fewer than five comma-separated fields left: truncated record.
        if (!attribs[3] || !token)
            break;
        attribs[4] = token;

        // The last field of a record is terminated by the newline only.
        token = strtok(NULL, "\n");
        if (!token)
        {
            printf("Invalid line format detected. Expected ok/fraud\\n");
            break;
        }
        attribs[5] = token;

        // Strip trailing whitespace without ever indexing before the start
        // of the token; the cast keeps isspace() defined for bytes >= 0x80.
        size_t slen = strlen(token);
        while (slen > 0 && isspace((unsigned char)token[slen - 1]))
            token[--slen] = 0;

        // Grow geometrically instead of reallocating for every record.
        if (nTableLen == nTableCap)
        {
            size_t nNewCap = nTableCap ? nTableCap * 2 : 64;
            OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * nNewCap);
            if (NULL == tmp)
            {   // allocation failure.
                printf("Error allocating memory for expanding OKFraud data set");
                exit(EXIT_FAILURE);
            }
            pTable = tmp;
            nTableCap = nNewCap;
        }
        memcpy(pTable + nTableLen++, attribs, sizeof(attribs));

        // first token of the next record
        token = strtok(NULL, ",");
    }

    // set output variables
    *ppTable = pTable;
    *ppTableData = pTableData;
    return nTableLen;
}
Run Code Online (Sandbox Code Playgroud)
整合起来
把上述所有改动合并之后,您的代码会变成下面这样:
// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);
// walk table to determine probabilities of ok and fraud states.
// note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
if (0 == strcmp("ok", okfr[i][5]))
++counter_ok;
else
++counter_fraud;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
// compute probabilites for ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) //Reading each line from file to calculate the file size.
{
char *unknwn_attr1 = strtok(line1,",");
char *unknwn_attr2 = strtok(NULL,",");
char *unknwn_attr3 = strtok(NULL,",");
char *unknwn_attr4 = strtok(NULL,",");
char *unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
for (size_t i=0;i<okfr_len; ++i)
{
if( strcmp(okfr[i][5], "ok") == 0 )
{
// ok case
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_ok_attr5++;
}
else // fraud case
{
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_fraud_attr5++;
}
}
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if (counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);
fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;
Run Code Online (Sandbox Code Playgroud)
以上只是几点想法。其中肯定还有更多可以优化的地方,但这些改动应该能极大地帮助您以单次正向扫描、连续输出的方式处理文件,在这种情况下这差不多已经是能达到的最高效率了。毫无疑问,三项措施结合起来——文件只打开和关闭一次、逻辑缩减,以及对 sales_ok_fraud.txt 文件只解析一次并缓存——将带来巨大的性能提升,尤其是第一项和最后一项。
编辑:已协助提问者更新此处理程序,使其预先加载 sales_ok_fraud.txt 文件的内容,从而避免对 15000 多行文本反复加载、解析并立即丢弃(此前每处理一行主输入就要重复一次)。上面的答案已相应更新。
归档时间: |
|
查看次数: |
216 次 |
最近记录: |