
Experiment No 1

60004190123
Vidhan Shah
CS-B/B3

Aim: Implementation of PageRank estimation.


Theory:
In social network analysis, PageRank is an important topic. PageRank describes how web pages are ranked according to their importance and their relevance to a search. All major search engines use some form of page ranking; Google is the best-known example, computing PageRank over the web graph.

PageRank works by counting the number and quality of links to a page to determine a rough
estimate of how important the website is. The underlying assumption is that more important
websites are likely to receive more links from other websites.

The algorithm steps are listed below (a minimal sketch follows the list):

• Initialize the PageRank of every node with a value of 1
• For each iteration, update the PageRank of every node in the graph
• The new PageRank is the sum of the proportional rank (rank divided by out-degree) of all of its parents
• Apply the random-walk (damping) factor to the new PageRank
• PageRank values converge after enough iterations
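
The update rule above can be illustrated with a short, self-contained sketch. This is a minimal illustration only, separate from the crawler code in the next section; the example graph, node names and the damping value 0.85 are assumptions made for demonstration:

def simple_pagerank(graph, damping=0.85, iterations=50):
    '''graph: dict mapping each node to the list of nodes it links to.'''
    nodes = list(graph)
    rank = {node: 1.0 for node in nodes}          # step 1: initialize every node's rank to 1
    for _ in range(iterations):                   # step 2: repeat until the values settle
        new_rank = {}
        for node in nodes:
            # step 3: sum the proportional rank of all parents (pages that link to this node)
            incoming = sum(rank[p] / len(graph[p]) for p in nodes if node in graph[p])
            # step 4: apply the random-walk (damping) factor
            new_rank[node] = (1 - damping) + damping * incoming
        rank = new_rank
    return rank

# Example on a tiny assumed graph: C is linked to by both A and B, so it ends up ranked highest
example = {'A': ['B', 'C'], 'B': ['C'], 'C': ['A']}
print(simple_pagerank(example))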

Code & Output:


!pip install 'networkx<2.7'

from urllib.parse import urlparse
try:
    import urllib.request as urllib2   # Python 3
except ImportError:
    import urllib2                     # Python 2 fallback
import socket
import threading, queue
import re
import sys
import os
import networkx as nx
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize': (7, 5), 'figure.dpi': 80})
import pandas as pd
import numpy as np

q = queue.Queue()
max_threads=50
next_url=queue.Queue()
crawled_urls=[]

def check_link(url):
    # Treat a link as crawlable unless it points to a PDF file
    filetype = urlparse(url).path.split('/')[-1].split('.')[-1]
    return filetype != 'pdf'

def get_links_from_page(url):
    urllist = []
    try:
      res=urllib2.urlopen(url)
      htmlpage=res.read()
    except:
      return urllist

    try:
      page = BeautifulSoup(htmlpage, 'html.parser')
    except:
      return urllist

    # Keep only anchors whose href matches the pattern; slice to limit links taken per page
    pattern = re.compile(r"https://www.*.com/*")
    refs = page.findAll("a", href=pattern)[25:35]
    for a in refs:
      try:
        link = a['href']
        if link[:4] == 'http':
          urllist.append(link)
      except:
        pass

    return urllist

def find_links(url_tuple, graph):
    '''Crawls to a given depth using a tuple structure to tag urls with their depth'''
    global crawled_urls, next_url, max_depth
    url = url_tuple[0]
    depth = url_tuple[1]
    if depth < 2:
      links = get_links_from_page(url)
      for link in links:
        # These two lines create the graph
        graph.add_node(link)
        graph.add_edge(url, link)
        # If the link has not been crawled yet, queue it with an increased depth
        if link not in crawled_urls:
          next_url.put((link, depth + 1))
          crawled_urls.append(link)
    return

class crawler_thread(threading.Thread):
    def __init__(self, queue, graph):
      threading.Thread.__init__(self)
      self.to_be_crawled = queue
      self.graph = graph

    def run(self):
      # Crawl until the shared queue of (url, depth) tuples stays empty
      while True:
        try:
          url_tuple = self.to_be_crawled.get(timeout=1)
        except queue.Empty:
          break
        find_links(url_tuple, self.graph)

def draw_graph(graph, graph_file_name):
  '''Function to draw the graph and save the files'''
  nx.draw(graph, with_labels=False)
  # write_dot requires pydot; os.getcwd() replaces the non-existent os.cwd()
  nx.drawing.nx_pydot.write_dot(graph, os.path.join(os.getcwd(), graph_file_name + '.dot'))
  plt.savefig(os.path.join(os.getcwd(), graph_file_name + '.png'))
 
def calculatePageRank(url):
  root_url = url
  parser_flag = 'beautifulsoup'
  max_depth=2

  next_url.put((root_url,0))
  crawled_urls.append(root_url)
  ip_list=[]
  g=nx.Graph()
  g.add_node(root_url)
  thread_list=[]

  for i in range(max_threads):  # spawn the crawler threads
    t=crawler_thread(next_url,g)
    t.daemon=True
    t.start()
    thread_list.append(t)

  for t in thread_list:
    t.join()

  print('Number of crawled URLs:', len(crawled_urls))
  # for url in crawled_urls:
  #   print(url)
  #   ip_list.append(socket.gethostbyname(urlparse(url).netloc))
  #   ip_set=set(ip_list)
  #   ip_list = [*ip_set, ]

  # print("Unique Host: %s " % len(ip_list))
  pagerank = nx.pagerank(g, alpha=0.85, personalization=None, weight='weight', dangling=None)
  
  # Sort pages by PageRank in descending order so the highest-ranked pages come first
  list_data = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
  numpy_array = np.array(list_data)
  print(numpy_array, '\n')
  
  edgeNumber = g.number_of_edges()
  nodeNumber = g.number_of_nodes()
  nodesize = [g.degree(n) * 10 for n in g]
  pos = nx.spring_layout(g, iterations=20)

  # Create the figure before drawing so the size applies to the plotted graph
  plt.figure(figsize=(5, 5))
  nx.draw(g, with_labels=False)
  nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
  nx.draw_networkx_edges(g, pos)
  plt.show()
  # return np.array(len(ip_list), pagerank.get(url), edgeNumber, nodeNumber)

  print()
  print('Top 10 Websites:')
  for i in range(0, min(10, len(numpy_array))):
    print(i+1, '-', numpy_array[i][0])

  # return pd.Series([pagerank.get(url), edgeNumber, nodeNumber], index=['pagerank', 'edges', 'nodes'])

url = 'https://www.geeksforgeeks.org/machine-learning/'
calculatePageRank(url)

Conclusion: Thus, in this experiment we understood the concept of PageRank estimation. We implemented it by crawling from a seed URL and building a graph in which every crawled link is a node and every hyperlink is an edge. A link that is pointed to by many edges is considered of higher quality than other links, and on this basis we found the top 10 links using Python.
