这里,我们得到了网页间的链接关系,可以比较方便的开始进行pagerank的计算了。首先需要预处理得到整个链接关系图。对于一个url的定义如下:
struct url
{
int id;
string urlTxt;
int outDegree;
vector<int> refList;
int inDegree;
double score[2];
bool operator > (const url& u) const
{
return score[scoreIdx]>u.score[scoreIdx];
}
};
预处理得到Url集合,为了加速运算,将每个url编号,在图中只存储每个url的编号。
void initUrl()
{
cout<<nowTime()<<" Begin init user."<<endl;
ifstream in(mapPath.c_str());
string urlStr;
int id=0;
while(in>>urlStr)
{
if(urlStr.substr(urlStr.size()-4)==".css"||urlStr.substr(urlStr.size()-3)==".js")
continue;
if(urlId.find(urlStr)==urlId.end())
{
urlId[urlStr]=id;
url newUrl;
newUrl.id=id;
newUrl.inDegree=0;
newUrl.outDegree=0;
newUrl.urlTxt=urlStr;
urls.push_back(newUrl);
id++;
}
}
in.close();
}
得到整个图关系代码如下:
void initNet()
{
cout << nowTime() << ": Begin init net." << endl;
ifstream in(mapPath.c_str());
string fromStr,toStr,rootStr;
in>>rootStr;
while(in>>fromStr>>toStr)
{
if(links.find(fromStr+"#"+toStr)!=links.end()) continue;
else links.insert(fromStr+"#"+toStr);
if(fromStr.substr(fromStr.size()-4)==".css"||toStr.substr(toStr.size()-3)==".js")
{
continue;
}
int from=urlId[fromStr];
int to=urlId[toStr];
urls[to].refList.push_back(from);
urls[to].inDegree++;
urls[from].outDegree++;
}
in.close();
cout << nowTime() << ": Complete init net." << endl;
}
计算pagerank的代码如下:
int calPagerank()
{
cout << nowTime() << ": Begin calculate pagerank." << endl;
int oldIdx = 0;
int newIdx = 1;
int urlNum=urls.size();
for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;
for(int iteration=0; iteration <= 10000; iteration++)
{
cout<<nowTime()<<": iteration "<<iteration<<endl;
for(int i =0;i<urlNum;i++)
{
urls[i].score[newIdx] = 0;
}
double perSum = 0;
for(int i = 0; i < urlNum; i ++)
{
urls[i].score[newIdx] = sum*rem/urlNum;
if(urls[i].inDegree> 0)
{
for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
{
int id = *iter;
urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
}
}
perSum += urls[i].score[newIdx];
}
for(int i=0;i<urlNum;i++)
{
urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
}
double dif = 0;
for(int i = 0; i < urlNum; i ++)
{
dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
}
if( dif < bound )
{
break;
}
oldIdx = newIdx;
newIdx = 1 - oldIdx;
}
cout << nowTime() << ": Complete calculate pagerank." << endl;
return newIdx;
}
将得到的pagerank结果写入文件:
void pagerankWriteToFile()
{
cout << nowTime() << ": Begin write pagerank to file." << endl;
ofstream out(rankPath.c_str(), ios::trunc|ios::out);
out << setiosflags(ios::fixed) << setprecision(20);
sort(urls.begin(),urls.end(),greater<url>());
for(int i=0;i<urls.size();i++)
{
out<<urls[i].urlTxt<<"\t"<<urls[i].score[scoreIdx]<<"\t"<<urls[i].inDegree<<endl;
}
out.close();
cout << nowTime() << ": Complete write pagerank to file." << endl;
}
进行完这些计算后,和同学交流,发现得到的结果出入较大。交流发现,是对出度为0的点没有处理的原因。由于我抓的图是部分的图,所以出度为0的点,实际上是和其他网络交流的一个Saddle,但是应有的pagerank并没有从这里流出去。这个出度为0的点像黑洞一样只吸收pagerank值却不支出pagerank值。所以,需要将这些出度为0的点进行特殊处理,即认为这些节点向其他每个节点都连了一条边。经过修改后计算pagerank的代码如下:
int calPagerank()
{
cout << nowTime() << ": Begin calculate pagerank." << endl;
int oldIdx = 0;
int newIdx = 1;
int urlNum=urls.size();
for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;
for(int iteration=0; iteration <= 10000; iteration++)
{
cout<<nowTime()<<": iteration "<<iteration<<endl;
for(int i=0;i<urlNum;i++) urls[i].score[newIdx] = sum*rem/urlNum;
for(int i=0;i<urlNum;i++)
{
if(urls[i].outDegree!=0) continue;
for(int j=0;j<urlNum;j++)
{
if(i==j) continue;
urls[j].score[newIdx]+=(1-rem)*urls[i].score[oldIdx]/(urlNum-1);
}
}
double perSum = 0;
for(int i = 0; i < urlNum; i ++)
{
if(urls[i].inDegree> 0)
{
for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
{
int id = *iter;
urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
}
}
perSum += urls[i].score[newIdx];
}
for(int i=0;i<urlNum;i++)
{
urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
}
double dif = 0;
for(int i = 0; i < urlNum; i ++)
{
dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
}
if( dif < bound )
{
break;
}
oldIdx = newIdx;
newIdx = 1 - oldIdx;
}
cout << nowTime() << ": Complete calculate pagerank." << endl;
return newIdx;
}
以下是程序完整的C++代码:
#include <iostream>
#include <fstream>
#include <iomanip>
#include <time.h>
#include <map>
#include <list>
#include <cstdlib>
#include <cstring>
#include <string>
#include <memory.h>
#include <algorithm>
#include <vector>
#include <cmath>
#include <set>
using namespace std;
int scoreIdx;
struct url
{
int id;
string urlTxt;
int outDegree;
vector<int> refList;
int inDegree;
double score[2];
bool operator > (const url& u) const
{
return score[scoreIdx]>u.score[scoreIdx];
}
};
map<string,int> urlId;
set<string> links;
vector<url> urls;
double sum = 1.0;
double rem = 0.15;
double bound = 0.000001;
string mapPath="linkMap.txt";
string rankPath="pagerank.txt";
string nowTime()
{
char outTime[64];
time_t t = time(0);
strftime(outTime, sizeof(outTime), "%Y/%m/%d %X", localtime(&t));
return outTime;
}
void initUrl()
{
cout<<nowTime()<<" Begin init user."<<endl;
ifstream in(mapPath.c_str());
string urlStr;
int id=0;
while(in>>urlStr)
{
if(urlStr.substr(urlStr.size()-4)==".css"||urlStr.substr(urlStr.size()-3)==".js")
{
continue;
}
if(urlId.find(urlStr)==urlId.end())
{
urlId[urlStr]=id;
url newUrl;
newUrl.id=id;
newUrl.inDegree=0;
newUrl.outDegree=0;
newUrl.urlTxt=urlStr;
urls.push_back(newUrl);
id++;
}
}
in.close();
}
void initNet()
{
cout << nowTime() << ": Begin init net." << endl;
ifstream in(mapPath.c_str());
string fromStr,toStr,rootStr;
in>>rootStr;
int cnt=0;
while(in>>fromStr>>toStr)
{
if(links.find(fromStr+"#"+toStr)!=links.end()) continue;
else links.insert(fromStr+"#"+toStr);
if(fromStr.substr(fromStr.size()-4)==".css"||toStr.substr(toStr.size()-4)==".css"||fromStr.substr(fromStr.size()-3)==".js"||toStr.substr(toStr.size()-3)==".js")
{
continue;
}
cnt++;
int from=urlId[fromStr];
int to=urlId[toStr];
urls[to].refList.push_back(from);
urls[to].inDegree++;
urls[from].outDegree++;
}
cout<<"The link number is " << cnt<<endl;
in.close();
cout << nowTime() << ": Complete init net." << endl;
}
int calPagerank()
{
cout << nowTime() << ": Begin calculate pagerank." << endl;
int oldIdx = 0;
int newIdx = 1;
int urlNum=urls.size();
for(int i=0;i<urlNum;i++) urls[i].score[oldIdx]=sum/urlNum;
for(int iteration=0; iteration <= 10000; iteration++)
{
cout<<nowTime()<<": iteration "<<iteration<<endl;
for(int i=0;i<urlNum;i++) urls[i].score[newIdx] = sum*rem/urlNum;
for(int i=0;i<urlNum;i++)
{
if(urls[i].outDegree!=0) continue;
for(int j=0;j<urlNum;j++)
{
if(i==j) continue;
urls[j].score[newIdx]+=(1-rem)*urls[i].score[oldIdx]/(urlNum-1);
}
}
double perSum = 0;
for(int i = 0; i < urlNum; i ++)
{
if(urls[i].inDegree> 0)
{
for(vector<int>::iterator iter = urls[i].refList.begin(); iter != urls[i].refList.end(); iter++)
{
int id = *iter;
urls[i].score[newIdx] += (1-rem)*urls[id].score[oldIdx]/urls[id].outDegree;
}
}
perSum += urls[i].score[newIdx];
}
for(int i=0;i<urlNum;i++)
{
urls[i].score[newIdx] = urls[i].score[newIdx] / perSum;
}
double dif = 0;
for(int i = 0; i < urlNum; i ++)
{
dif += abs(urls[i].score[newIdx] - urls[i].score[oldIdx]);
}
if( dif < bound )
{
break;
}
oldIdx = newIdx;
newIdx = 1 - oldIdx;
}
cout << nowTime() << ": Complete calculate pagerank." << endl;
return newIdx;
}
void pagerankWriteToFile()
{
cout << nowTime() << ": Begin write pagerank to file." << endl;
ofstream out(rankPath.c_str(), ios::trunc|ios::out);
out << setiosflags(ios::fixed) << setprecision(20);
sort(urls.begin(),urls.end(),greater<url>());
for(int i=0;i<urls.size();i++)
{
out<<urls[i].urlTxt<<"\t"<<urls[i].score[scoreIdx]<<"\t"<<urls[i].inDegree<<endl;
}
out.close();
cout << nowTime() << ": Complete write pagerank to file." << endl;
}
int main()
{
initUrl();
initNet();
scoreIdx = calPagerank();
pagerankWriteToFile();
return 0;
}
经过如此计算后,结果虽然有一定变化,但是变化不大。也许是数据集或者抓取过程的原因,我也没在我的代码以及他的代码中发现什么问题。如果有发现代码问题的,请留言,不胜感激。下面将对得到的结果进行介绍。
原文链接: https://www.cnblogs.com/goodness/archive/2012/04/16/2452024.html
欢迎关注
微信关注下方公众号,第一时间获取干货硬货;公众号内回复【pdf】免费获取数百本计算机经典书籍
原创文章受到原创版权保护。转载请注明出处:https://www.ccppcoding.com/archives/47648
非原创文章文中已经注明原地址,如有侵权,联系删除
关注公众号【高性能架构探索】,第一时间获取最新文章
转载文章受原作者版权保护。转载请注明原作者出处!