目的:从数据库中抽取文章关键词,并统计这些关键词在哪些文章中出现、各出现多少次(算是词袋子模型吧);然后对每篇文章形成VSM模型,写成weka的数据格式,再调用weka对文章聚类。
目前“形成词袋子模型”这一块已经完毕。
其中词袋子的数据结构如下:
map<string, vector<pair<int,int>>>,即:关键词 -> [(文章ID, 该关键词在该文章中出现的次数), ...]
目前已经完成此部分的 serialize(save/load)以及 print 功能。
#include "stdafx.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
形成词袋子模型nt ConstructMap(map<string,vector<pair<int,int>>>&mymap)
{
vector<string>mySplit(strings);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute("select CKeyWord,ArticleId from Article order by ArticleId",NULL,adCmdText);
while(!pRst->rsEOF)
{ vector<string>wordcollection;
stringkeywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
if(keywordstr!="")
{
wordcollection=mySplit(keywordstr);
stringtempid=(_bstr_t)pRst->GetCollect("ArticleId");
intarticleid=atoi(tempid.c_str());
for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vector<pair<int,int>>::iterator it;
if(mymap[strit].empty())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[strit].push_back(mytemppair);
}
else
{
for(it=mymap[strit].begin();it!=mymap[strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
}
}
if(it==mymap[strit].end())
{
pair<int,int>mytemppair=make_pair(articleid,1);
mymap[strit].push_back(mytemppair);
}
}
}
}
pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize();
return0;
}
// Load the bag-of-words model.
//
// Parses the whitespace-separated text file at `path`, laid out as
//   <entry count>
//   <keyword>
//   <posting count N>
//   <articleId> <count> ;   (repeated N times)
// with the last three lines repeated per keyword, and stores each keyword's
// (articleId, count) list into mymap, replacing any list already stored under
// that keyword.
//
// mymap: receives the entries; keywords absent from the file are left alone.
// path:  file to read; defaults to the location this function always used.
//
// If the file cannot be opened or has no header, mymap is left unchanged.
// A record that is cut short is discarded and parsing stops there.
void load(std::map<std::string, std::vector<std::pair<int, int>>>& mymap,
          const std::string& path = "c:\\mydict.dat")
{
    std::ifstream infile(path.c_str(), std::ios::binary);
    if (!infile)
    {
        return;
    }

    // The header holds the number of entries. It is read only to step past it;
    // the loop below runs until the data ends.
    std::size_t entryCount = 0;
    if (!(infile >> entryCount))
    {
        return;
    }

    std::string key;       // keyword of the record being read
    int postingCount = 0;  // number of articles the keyword occurs in

    // Test the extraction itself instead of eof(): an eof() loop runs one extra
    // time after the last record (storing a bogus "" entry) and never ends when
    // the stream is in a failed state without having reached end-of-file.
    while (infile >> key >> postingCount)
    {
        std::vector<std::pair<int, int>> postings;
        bool complete = true;

        for (int i = 0; i < postingCount; ++i)
        {
            int articleId = 0;      // article number
            int count = 0;          // occurrences of the keyword in that article
            std::string separator;  // the ";" token that ends each pair
            if (!(infile >> articleId >> count >> separator))
            {
                complete = false;
                break;
            }
            postings.push_back(std::make_pair(articleId, count));
        }

        if (!complete)
        {
            break;
        }
        mymap[key] = postings;
    }
}
// Save the bag-of-words model.
//
// Writes mymap to the text file at `path` as
//   <entry count>
//   <keyword>
//   <posting count N>
//   <articleId> <count> ;   (repeated N times, on one line)
// with the last three lines repeated per keyword. Fields within a posting are
// separated by single spaces so that whitespace-delimited extraction can read
// the numbers back individually.
//
// mymap: the model to write; not modified.
// path:  file to create or overwrite; defaults to the location this function
//        always used.
void save(std::map<std::string, std::vector<std::pair<int, int>>>& mymap,
          const std::string& path = "c:\\mydict.dat")
{
    std::ofstream outfile(path.c_str(), std::ios::binary);
    outfile << mymap.size() << '\n';

    for (auto entry = mymap.begin(); entry != mymap.end(); ++entry)
    {
        outfile << entry->first << '\n';
        outfile << entry->second.size() << '\n';
        for (auto posting = entry->second.begin(); posting != entry->second.end(); ++posting)
        {
            outfile << posting->first << ' ' << posting->second << ' ' << ";" << ' ';
        }
        outfile << '\n';
    }
    outfile.close();
}

// Print the bag-of-words model.
//
// Writes to std::cout the number of entries, then for each keyword: the
// keyword, the number of postings, and a line of "articleId,count;" items.
//
// mymap: the model to print; not modified.
void print(std::map<std::string, std::vector<std::pair<int, int>>>& mymap)
{
    std::cout << mymap.size() << '\n';

    for (auto entry = mymap.begin(); entry != mymap.end(); ++entry)
    {
        std::cout << entry->first << '\n';
        std::cout << entry->second.size() << '\n';
        for (auto posting = entry->second.begin(); posting != entry->second.end(); ++posting)
        {
            std::cout << posting->first << ',' << posting->second << ";";
        }
        std::cout << '\n';
    }
    std::cout.flush();
}
原文链接: https://www.cnblogs.com/finallyliuyu/archive/2010/08/25/1808300.html
欢迎关注
微信关注下方公众号,第一时间获取干货硬货;公众号内回复【pdf】免费获取数百本计算机经典书籍
原创文章受到原创版权保护。转载请注明出处:https://www.ccppcoding.com/archives/14283
非原创文章文中已经注明原地址,如有侵权,联系删除
关注公众号【高性能架构探索】,第一时间获取最新文章
转载文章受原作者版权保护。转载请注明原作者出处!