基于 TF-IDF 的文档集关键词提取程序。
压缩包中已经含有测试文档集,
可以替换成任意需要的文档集,
也可以自己提供词典。
代码片段和文件信息
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-04-10 20:25 Tfidf_Calculate
文件 4456 2015-05-28 00:17 Tfidf_CalculateDirInput.cpp
文件 940 2015-05-28 00:18 Tfidf_Calculatemain.cpp
目录 0 2015-04-11 11:46 Tfidf_Calculatemingw5
文件 3268 2003-07-21 19:40 Tfidf_Calculatemingw5(1).txt
文件 5626 2015-04-15 22:41 Tfidf_Calculatemingw5(1)Out.txt
文件 998 2003-07-21 19:40 Tfidf_Calculatemingw5(10).txt
文件 1698 2015-04-15 22:41 Tfidf_Calculatemingw5(10)Out.txt
文件 1341 2003-07-21 19:40 Tfidf_Calculatemingw5(100).txt
文件 2283 2015-04-15 22:41 Tfidf_Calculatemingw5(100)Out.txt
文件 699 2003-07-21 19:40 Tfidf_Calculatemingw5(101).txt
文件 1241 2015-04-15 22:41 Tfidf_Calculatemingw5(101)Out.txt
文件 963 2003-07-21 19:40 Tfidf_Calculatemingw5(102).txt
文件 1651 2015-04-15 22:41 Tfidf_Calculatemingw5(102)Out.txt
文件 3045 2003-07-21 19:40 Tfidf_Calculatemingw5(103).txt
文件 5183 2015-04-15 22:41 Tfidf_Calculatemingw5(103)Out.txt
文件 785 2003-07-21 19:40 Tfidf_Calculatemingw5(104).txt
文件 1339 2015-04-15 22:41 Tfidf_Calculatemingw5(104)Out.txt
文件 814 2003-07-21 19:40 Tfidf_Calculatemingw5(105).txt
文件 1442 2015-04-15 22:41 Tfidf_Calculatemingw5(105)Out.txt
文件 1190 2003-07-21 19:40 Tfidf_Calculatemingw5(106).txt
文件 2168 2015-04-15 22:41 Tfidf_Calculatemingw5(106)Out.txt
文件 1265 2003-07-21 19:40 Tfidf_Calculatemingw5(107).txt
文件 2209 2015-04-15 22:41 Tfidf_Calculatemingw5(107)Out.txt
文件 1157 2003-07-21 19:40 Tfidf_Calculatemingw5(108).txt
文件 2001 2015-04-15 22:41 Tfidf_Calculatemingw5(108)Out.txt
文件 1195 2003-07-21 19:40 Tfidf_Calculatemingw5(109).txt
文件 2011 2015-04-15 22:41 Tfidf_Calculatemingw5(109)Out.txt
文件 788 2003-07-21 19:40 Tfidf_Calculatemingw5(11).txt
文件 1400 2015-04-15 22:41 Tfidf_Calculatemingw5(11)Out.txt
文件 1000 2003-07-21 19:40 Tfidf_Calculatemingw5(110).txt
............此处省略308个文件信息
///////////////////////////////////////////////////////////////////
// File :Dir txt Input
// Author :ShuanHolmes
// Date :2015.4.10
// Modifier :...
// Modify Date :...
// Description :statics_Dir.cpp
///////////////////////////////////////////////////////////////////
#include “Statics.h“
extern map< string int > Dic;
extern list< string > SinStatics;
extern multiset< string > SumStatics;
extern multiset< string > Fileidf;
extern set< Word > Database;
list< WordIDF > DataOut;
void getJustCurrentFile( string path vector& files)
{ // return file iter
long hFile = 0; // file info
struct _finddata_t fileinfo;
string p;
if((hFile = _findfirst(p.assign(path).append(“\*“).c_str()&fileinfo)) != -1)
{
do
{
if((fileinfo.attrib & _A_SUBDIR));
else
files.push_back(fileinfo.name);
}while(_findnext(hFile &fileinfo) == 0);
_findclose(hFile);
}
}
// Transfers the words held in SinStatics into SumStatics, inserting each
// distinct word exactly once, and leaves SinStatics empty.
//
// NOTE(review): SinStatics presumably holds the words of a single document,
// so that SumStatics.count(word) ends up as the number of documents that
// contain the word - confirm against the caller.
void WordFrequency( void )
{
    // list::unique() only collapses *adjacent* equal elements, so the list
    // has to be sorted first; otherwise a word repeated in non-adjacent
    // positions is inserted several times.
    SinStatics.sort();
    SinStatics.unique();

    SumStatics.insert(SinStatics.begin(), SinStatics.end());
    SinStatics.clear(); // register clear
}
// Builds Database from SumStatics: for every distinct word it stores a Word
// whose `wordfrequency` is |log2(N / count)|, where `count` is the number of
// times the word occurs in SumStatics. Afterwards SinStatics, SumStatics and
// Dic are cleared.
//
// N : numerator of the ratio (the original comment calls it "the file
//     group"; presumably the number of documents - confirm with the caller).
void DatabaseConstruction( float N ) // the file group
{
    Word temp;
    multiset< string >::iterator it = SumStatics.begin();
    while(it != SumStatics.end())
    {
        temp.wordfrequency = fabs(log(N/(float)SumStatics.count(*it))/log(2));
        temp.word = *it;
        if(Database.find(temp)==Database.end())
        {
            Database.insert(temp);
        }
        // Equal keys are stored next to each other in a multiset, and every
        // duplicate would yield the very same `temp`; jump straight to the
        // next distinct word instead of recomputing it for each copy.
        it = SumStatics.upper_bound(*it);
    }
    SinStatics.clear();
    SumStatics.clear();
    Dic.clear();
}
// Reads `filename` line by line and segments every line from its end towards
// its start: the longest tail of at most MaxWordLength bytes is passed to
// TFidfWordCheck(), and the candidate is shortened by 2 bytes at a time until
// the check succeeds or the candidate is 2 bytes or shorter. The accepted
// tail is then cut off and the remainder is processed the same way.
//
// filename : path of the text file to segment.
//
// Reports the outcome of opening the file on cerr/cout. If the file cannot be
// opened the function returns right after the error message.
//
// NOTE(review): nothing is stored by this function itself; any recording of
// the words presumably happens inside TFidfWordCheck() - confirm there.
void TfidfFileInput(char *filename) // segment the sentence store the real words
{
    ifstream testfile(filename);
    if (!testfile)
    {
        // A stream that failed to open never reaches eof(), so carrying on
        // would spin forever; stop here instead.
        cerr << "Fail to open " << filename << endl;
        return;
    }
    cout << "Succeed to open " << filename << endl;
    cout << "Please wait "<< filename << " segmenting the sentences in test file!" << endl;

    string testsentence;
    // Loop on the read itself rather than on eof(), so a failed read ends
    // the loop.
    while(getline(testfile, testsentence, '\n'))
    {
        string sentence_temp = testsentence;
        while(!sentence_temp.empty())
        {
            const int len1 = static_cast<int>(sentence_temp.length());
            int len2 = len1;
            if(len2 > MaxWordLength) // MaxLength
                len2 = MaxWordLength;

            string testword = sentence_temp.substr(len1-len2);
            bool isw = TFidfWordCheck( testword );
            // Shrink in 2-byte steps; the original comment states
            // "2 Byte 1 word", i.e. a double-byte text encoding is assumed.
            while(len2 > 2 && isw == false)
            {
                len2 = len2-2; // 2 Byte 1 word
                testword = sentence_temp.substr(len1-len2);
                isw = TFidfWordCheck( testword );
            }

            // Drop the tail that was just handled and continue with the rest.
            sentence_temp = sentence_temp.substr(0, len1-len2); // next sentence
        }
    }
    // testfile is closed by its destructor.
}
bool TFidfWordCheck(string test_word) // whether t
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2015-04-10 20:25 Tfidf_Calculate
文件 4456 2015-05-28 00:17 Tfidf_CalculateDirInput.cpp
文件 940 2015-05-28 00:18 Tfidf_Calculatemain.cpp
目录 0 2015-04-11 11:46 Tfidf_Calculatemingw5
文件 3268 2003-07-21 19:40 Tfidf_Calculatemingw5(1).txt
文件 5626 2015-04-15 22:41 Tfidf_Calculatemingw5(1)Out.txt
文件 998 2003-07-21 19:40 Tfidf_Calculatemingw5(10).txt
文件 1698 2015-04-15 22:41 Tfidf_Calculatemingw5(10)Out.txt
文件 1341 2003-07-21 19:40 Tfidf_Calculatemingw5(100).txt
文件 2283 2015-04-15 22:41 Tfidf_Calculatemingw5(100)Out.txt
文件 699 2003-07-21 19:40 Tfidf_Calculatemingw5(101).txt
文件 1241 2015-04-15 22:41 Tfidf_Calculatemingw5(101)Out.txt
文件 963 2003-07-21 19:40 Tfidf_Calculatemingw5(102).txt
文件 1651 2015-04-15 22:41 Tfidf_Calculatemingw5(102)Out.txt
文件 3045 2003-07-21 19:40 Tfidf_Calculatemingw5(103).txt
文件 5183 2015-04-15 22:41 Tfidf_Calculatemingw5(103)Out.txt
文件 785 2003-07-21 19:40 Tfidf_Calculatemingw5(104).txt
文件 1339 2015-04-15 22:41 Tfidf_Calculatemingw5(104)Out.txt
文件 814 2003-07-21 19:40 Tfidf_Calculatemingw5(105).txt
文件 1442 2015-04-15 22:41 Tfidf_Calculatemingw5(105)Out.txt
文件 1190 2003-07-21 19:40 Tfidf_Calculatemingw5(106).txt
文件 2168 2015-04-15 22:41 Tfidf_Calculatemingw5(106)Out.txt
文件 1265 2003-07-21 19:40 Tfidf_Calculatemingw5(107).txt
文件 2209 2015-04-15 22:41 Tfidf_Calculatemingw5(107)Out.txt
文件 1157 2003-07-21 19:40 Tfidf_Calculatemingw5(108).txt
文件 2001 2015-04-15 22:41 Tfidf_Calculatemingw5(108)Out.txt
文件 1195 2003-07-21 19:40 Tfidf_Calculatemingw5(109).txt
文件 2011 2015-04-15 22:41 Tfidf_Calculatemingw5(109)Out.txt
文件 788 2003-07-21 19:40 Tfidf_Calculatemingw5(11).txt
文件 1400 2015-04-15 22:41 Tfidf_Calculatemingw5(11)Out.txt
文件 1000 2003-07-21 19:40 Tfidf_Calculatemingw5(110).txt
............此处省略308个文件信息
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件举报,一经查实,本站将立刻删除。
评论列表(条)