Tfidf词频计算


基于 TF-IDF 的文档集关键词提取。已经含有测试文档集,可以替换成任意需要的文档集,也可以自己提供字典。
资源截图
代码片段和文件信息
///////////////////////////////////////////////////////////////////
// File          :Dir txt Input
// Author        :ShuanHolmes
// Date          :2015.4.10
// Modifier      :...
// Modify Date   :...
// Description   :statics_Dir.cpp
///////////////////////////////////////////////////////////////////
#include "Statics.h"

#include <cstdint>

extern map< string int > Dic;
extern list< string > SinStatics;
extern multiset< string > SumStatics;
extern multiset< string > Fileidf;
extern set< Word > Database;
list< WordIDF > DataOut;

void getJustCurrentFile( string path vector& files)  
{    // return file iter    
long  hFile  =  0;    // file info   
struct _finddata_t fileinfo;    
string p;    
if((hFile = _findfirst(p.assign(path).append(“\*“).c_str()&fileinfo)) != -1)    
{      
do     
{         
if((fileinfo.attrib & _A_SUBDIR));               
else               
files.push_back(fileinfo.name);             
}while(_findnext(hFile &fileinfo) == 0);      
_findclose(hFile);    
}  
}

void WordFrequency( void )
{
SinStatics.unique();
while(!SinStatics.empty())
{
SumStatics.insert(SinStatics.back( ));
SinStatics.pop_back( );
}
SinStatics.clear(); // register clear
}

void DatabaseConstruction( float N ) // the file group 
{
multiset< string >::iterator it;
Word temp;
for(it = SumStatics.begin(); it != SumStatics.end(); it++ )
{
temp.wordfrequency = fabs(log(N/(float)SumStatics.count(*it))/log(2));
temp.word = *it;
if(Database.find(temp)==Database.end())
{
Database.insert(temp);
}
}
SinStatics.clear();
SumStatics.clear();
Dic.clear();
}

void TfidfFileInput(char *filename) // segment the sentence  store the real words
{
ifstream testfile(filename);
string testsentence;
string testword;
if (!testfile)
cerr << “Fail to open “ << filename << endl;
else
cout << “Succeed to open “ << filename << endl;
cout << “Please wait “<< filename << “ segmenting the sentences in test file!“ << endl;
while(!testfile.eof())
{
getline(testfiletestsentence‘
‘);
string result_temp=““;
int result_len = 0;
string sentence_temp=testsentence;
int cur_sen_length=testsentence.length();
int len1len2;
while(sentence_temp!=““)
{
len1 = sentence_temp.length();
len2 = sentence_temp.length();
if(len2 > MaxWordLength) // MaxLength 
len2 = MaxWordLength;
testword = sentence_temp.substr(len1-len2);
bool isw = TFidfWordCheck( testword );
while(len2 > 2 && isw == false)
{
len2 = len2-2; // 2 Byte 1 word
testword = sentence_temp.substr(len1-len2);
isw = TFidfWordCheck( testword );
}
if(result_temp == ““)
result_temp=testword+result_temp; // continue
else
result_temp=testword+“  “+result_temp; // cut
sentence_temp=sentence_temp.substr(0len1-len2); // next sentence
}
}
testfile.close();
}

bool TFidfWordCheck(string test_word) // whether t

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-04-10 20:25  Tfidf_Calculate
     文件        4456  2015-05-28 00:17  Tfidf_Calculate\DirInput.cpp
     文件         940  2015-05-28 00:18  Tfidf_Calculate\main.cpp
     目录           0  2015-04-11 11:46  Tfidf_Calculate\mingw5
     文件        3268  2003-07-21 19:40  Tfidf_Calculate\mingw5\(1).txt
     文件        5626  2015-04-15 22:41  Tfidf_Calculate\mingw5\(1)Out.txt
     文件         998  2003-07-21 19:40  Tfidf_Calculate\mingw5\(10).txt
     文件        1698  2015-04-15 22:41  Tfidf_Calculate\mingw5\(10)Out.txt
     文件        1341  2003-07-21 19:40  Tfidf_Calculate\mingw5\(100).txt
     文件        2283  2015-04-15 22:41  Tfidf_Calculate\mingw5\(100)Out.txt
     文件         699  2003-07-21 19:40  Tfidf_Calculate\mingw5\(101).txt
     文件        1241  2015-04-15 22:41  Tfidf_Calculate\mingw5\(101)Out.txt
     文件         963  2003-07-21 19:40  Tfidf_Calculate\mingw5\(102).txt
     文件        1651  2015-04-15 22:41  Tfidf_Calculate\mingw5\(102)Out.txt
     文件        3045  2003-07-21 19:40  Tfidf_Calculate\mingw5\(103).txt
     文件        5183  2015-04-15 22:41  Tfidf_Calculate\mingw5\(103)Out.txt
     文件         785  2003-07-21 19:40  Tfidf_Calculate\mingw5\(104).txt
     文件        1339  2015-04-15 22:41  Tfidf_Calculate\mingw5\(104)Out.txt
     文件         814  2003-07-21 19:40  Tfidf_Calculate\mingw5\(105).txt
     文件        1442  2015-04-15 22:41  Tfidf_Calculate\mingw5\(105)Out.txt
     文件        1190  2003-07-21 19:40  Tfidf_Calculate\mingw5\(106).txt
     文件        2168  2015-04-15 22:41  Tfidf_Calculate\mingw5\(106)Out.txt
     文件        1265  2003-07-21 19:40  Tfidf_Calculate\mingw5\(107).txt
     文件        2209  2015-04-15 22:41  Tfidf_Calculate\mingw5\(107)Out.txt
     文件        1157  2003-07-21 19:40  Tfidf_Calculate\mingw5\(108).txt
     文件        2001  2015-04-15 22:41  Tfidf_Calculate\mingw5\(108)Out.txt
     文件        1195  2003-07-21 19:40  Tfidf_Calculate\mingw5\(109).txt
     文件        2011  2015-04-15 22:41  Tfidf_Calculate\mingw5\(109)Out.txt
     文件         788  2003-07-21 19:40  Tfidf_Calculate\mingw5\(11).txt
     文件        1400  2015-04-15 22:41  Tfidf_Calculate\mingw5\(11)Out.txt
     文件        1000  2003-07-21 19:40  Tfidf_Calculate\mingw5\(110).txt
............此处省略308个文件信息

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件举报,一经查实,本站将立刻删除。

发表评论

评论列表(条)