WI Exp2
60004190123
Vidhan Shah
CS-B/B3
from urllib.parse import urlparse
try:
    import urllib.request as urllib2  # Python 3
except ImportError:
    import urllib2  # Python 2 fallback
import socket  # used by the commented-out unique-host count below
import threading
import queue
import re
import os
import networkx as nx
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # used by the commented-out Series return below
plt.rcParams.update({'figure.figsize': (7, 5), 'figure.dpi': 80})

max_threads = 50
max_depth = 2  # crawl depth limit, read by find_links()
next_url = queue.Queue()  # frontier of (url, depth) tuples
crawled_urls = []  # urls already queued for crawling
def check_link(url):
    '''Return True for links worth crawling (i.e. everything except PDFs).'''
    filetype = urlparse(url).path.split('/')[-1].split('.')[-1]
    return filetype != 'pdf'
def get_links_from_page(url):
    '''Fetch a page and return up to ten absolute links found on it.'''
    urllist = []
    try:
        res = urllib2.urlopen(url)
        htmlpage = res.read()
    except Exception:
        return urllist
    try:
        page = BeautifulSoup(htmlpage, 'html.parser')
    except Exception:
        return urllist
    # Keep only anchors matching https://www.<site>.com/... and slice a
    # small window of them to limit the fan-out per page.
    pattern = re.compile(r"https://www\..*\.com/")
    refs = page.findAll("a", href=pattern)[25:35]
    for a in refs:
        try:
            link = a['href']
            if link[:4] == 'http':
                urllist.append(link)
        except KeyError:
            pass
    return urllist
def find_links(url_tuple, graph):
    '''Crawls to a given depth using a (url, depth) tuple structure to tag
    urls with their depth.'''
    global crawled_urls, next_url, max_depth
    url, depth = url_tuple
    if depth < max_depth:
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph.
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, add it to the queue
            # with additional depth.
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return
class crawler_thread(threading.Thread):
    '''Worker thread that drains the url queue.'''
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph

    def run(self):
        # The crawl loop belongs in run(), not __init__, so that start()
        # actually performs the work on the new thread.
        while not self.to_be_crawled.empty():
            find_links(self.to_be_crawled.get(), self.graph)
def draw_graph(graph, graph_file_name):
    '''Function to draw the graph and save the files.'''
    nx.draw(graph, with_labels=False)
    nx.nx_pydot.write_dot(graph, os.path.join(os.getcwd(), graph_file_name + '.dot'))
    plt.savefig(os.path.join(os.getcwd(), graph_file_name + '.png'))
def calculatePageRank(url):
    root_url = url
    next_url.put((root_url, 0))
    crawled_urls.append(root_url)
    g = nx.Graph()
    g.add_node(root_url)
    # Start a pool of worker threads and wait for them to drain the queue.
    thread_list = []
    for i in range(max_threads):
        t = crawler_thread(next_url, g)
        t.daemon = True
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    print('Number of Crawled URLs:', len(crawled_urls))
    # ip_list = []
    # for url in crawled_urls:
    #     print(url)
    #     ip_list.append(socket.gethostbyname(urlparse(url).netloc))
    # ip_list = list(set(ip_list))
    # print("Unique Hosts: %s" % len(ip_list))
    pagerank = nx.pagerank(g, alpha=0.85, personalization=None,
                           weight='weight', dangling=None)
    numpy_array = np.array(list(pagerank.items()))
    print(numpy_array, '\n')
    edgeNumber = g.number_of_edges()
    nodeNumber = g.number_of_nodes()
    # Size each node by its degree so hubs stand out in the plot.
    nodesize = [g.degree(n) * 10 for n in g]
    pos = nx.spring_layout(g, iterations=20)
    nx.draw(g, with_labels=False)
    nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
    nx.draw_networkx_edges(g, pos)
    plt.show()
    print()
    print('Top 10 Websites:')
    for i in range(min(10, len(numpy_array))):
        print(i + 1, '-', numpy_array[i][0])
    # return pd.Series([pagerank.get(url), edgeNumber, nodeNumber],
    #                  index=['pagerank', 'edges', 'nodes'])
url = 'https://www.geeksforgeeks.org/machine-learning/'
calculatePageRank(url)
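
For reference, nx.pagerank computes these scores by power iteration on the update PR(v) = (1-d)/N + d * sum over u->v of PR(u)/outdeg(u), with damping factor d = alpha. Below is a minimal, self-contained sketch of that iteration on a toy directed graph; the function name pagerank_sketch and the toy adjacency list are illustrative, not part of the crawler above.

import numpy as np

def pagerank_sketch(adj, d=0.85, iters=50):
    '''adj[u] is the list of nodes that node u links to.'''
    n = len(adj)
    pr = np.full(n, 1.0 / n)  # start from a uniform distribution
    for _ in range(iters):
        new = np.full(n, (1.0 - d) / n)  # teleportation term (1-d)/N
        for u, outlinks in enumerate(adj):
            if outlinks:
                share = d * pr[u] / len(outlinks)
                for v in outlinks:
                    new[v] += share  # u passes an equal share to each target
            else:
                new += d * pr[u] / n  # dangling node: spread rank uniformly
        pr = new
    return pr

# Toy graph: 0 -> 1, 0 -> 2, 1 -> 2, 2 -> 0
print(pagerank_sketch([[1, 2], [2], [0]]))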
Conclusion: In this experiment we understood the concept of web crawlers and learned that they gather web information by indexing pages for search engines, which enables the engines to return relevant results for user queries. We also implemented a multithreaded crawler in Python and ranked the crawled pages with the PageRank algorithm.