
Experiment No 2

60004190123
Vidhan Shah
CS-B/B3

Aim: Design a crawler to gather web information.


Theory:
What is a web crawler?
A web crawler, also known as a web spider, robot, crawling agent or web scraper, is a
program that can serve two functions:
• Systematically browsing the web to index content for search engines. Web crawlers
copy pages for processing by a search engine, which indexes the downloaded pages
for easier retrieval so that users can get search results faster. This was the original
meaning of the term web crawler.
• Automatically retrieving content from any web page. This is more commonly called
web scraping. This meaning came about as companies other than search engines
started using web scrapers to retrieve web information. For example, e-commerce
companies monitor their competitors’ prices to support dynamic pricing, as sketched
below.
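As a small illustration of the second use case, the sketch below downloads a single page
and lists every hyperlink on it using urllib and BeautifulSoup. The URL is a placeholder
and not part of this experiment's code:

# Minimal scraping sketch; the target URL is a placeholder for illustration.
from urllib.request import urlopen
from bs4 import BeautifulSoup

page_url = 'https://www.example.com/'        # placeholder URL
html = urlopen(page_url).read()              # download the raw HTML
soup = BeautifulSoup(html, 'html.parser')    # parse it into a navigable tree

# Print every hyperlink text/target pair found on the page
for a in soup.find_all('a', href=True):
    print(a.get_text(strip=True), '->', a['href'])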
What is web crawling?
Web crawling is the process of indexing data on web pages using a program or automated
script. These programs are known by multiple names, including web crawler, spider and
spider bot, and are often shortened to simply crawler.
How does a web crawler work?
• Web crawlers start their crawling process by downloading the website’s robots.txt file.
The file may reference sitemaps that list the URLs the search engine is allowed to crawl.
• Once web crawlers start crawling a page, they discover new pages via hyperlinks.
• Crawlers add newly discovered URLs to the crawl queue so that they can be crawled
later, provided they are relevant to the crawler’s purpose.
Thanks to this flow, web crawlers can index every page that is linked from another; the
sketch below summarises these steps.
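The following is a minimal breadth-first sketch of that flow. The start URL, the depth
limit and the '*' user agent are assumptions for illustration only; the experiment’s
actual crawler follows in the next section:

# Minimal breadth-first crawl sketch; start URL and depth limit are placeholders.
from collections import deque
from urllib.parse import urljoin
from urllib.request import urlopen
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup

start_url = 'https://www.example.com/'
robots = RobotFileParser(urljoin(start_url, '/robots.txt'))
robots.read()                                    # step 1: download robots.txt

seen, frontier = {start_url}, deque([(start_url, 0)])
while frontier:
    url, depth = frontier.popleft()              # step 2: take the next URL from the crawl queue
    if depth > 1 or not robots.can_fetch('*', url):
        continue
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    for a in soup.find_all('a', href=True):      # step 3: discover new pages via hyperlinks
        link = urljoin(url, a['href'])
        if link.startswith('http') and link not in seen:
            seen.add(link)
            frontier.append((link, depth + 1))   # step 4: enqueue newly found URLs
print('Discovered', len(seen), 'URLs')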

Code & Output:


!pip install 'networkx<2.7'

import subprocess
from urllib.parse import urlparse
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from urllib.parse import urlencode
import socket
from multiprocessing import Queue
import threading
import getopt
import sys
import os
import networkx as nx
from bs4 import BeautifulSoup
import matplotlib

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize': (7, 5), 'figure.dpi': 80})
import queue  # threading is already imported above
import re
import pandas as pd
import numpy as np

q = queue.Queue()
max_threads=50
next_url=queue.Queue()
crawled_urls=[]

def check_link(url):
    # Accept a URL unless its path points to a PDF file
    filetype = urlparse(url).path.split('/')[-1].split('.')[-1]
    return filetype != 'pdf'

def get_links_from_page(url):
    '''Downloads a page and returns a small list of absolute links found on it.'''
    urllist = []
    try:
        res = urllib2.urlopen(url)
        htmlpage = res.read()
    except Exception:
        return urllist

    try:
        page = BeautifulSoup(htmlpage, 'html.parser')
    except Exception:
        return urllist

    # Keep only anchors whose href matches the pattern, limited to a small slice
    pattern = re.compile(r"https://www.*.com/*")
    refs = page.findAll("a", href=pattern)[25:35]
    for a in refs:
        try:
            link = a['href']
            if link.startswith('http'):
                urllist.append(link)
        except KeyError:
            pass

    return urllist

def find_links(url_tuple, graph):
    '''Crawls to a given depth using a (url, depth) tuple structure to tag URLs
    with their depth.'''
    global crawled_urls, next_url
    url, depth = url_tuple
    if depth < 2:
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, queue it with an increased depth
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return

class crawler_thread(threading.Thread):
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph

    def run(self):
        # Keep crawling until the shared queue of (url, depth) tuples is drained
        while not self.to_be_crawled.empty():
            try:
                find_links(self.to_be_crawled.get_nowait(), self.graph)
            except queue.Empty:
                break

def draw_graph(graph, graph_file_name):
    '''Draws the graph and saves .dot and .png files in the working directory.'''
    nx.draw(graph, with_labels=False)
    nx.drawing.nx_pydot.write_dot(graph, os.path.join(os.getcwd(), graph_file_name + '.dot'))
    plt.savefig(os.path.join(os.getcwd(), graph_file_name + '.png'))
 
def calculatePageRank(url):
  root_url = url
  parser_flag = 'beautifulsoup'
  max_depth=2

  next_url.put((root_url,0))
  crawled_urls.append(root_url)
  ip_list=[]
  g=nx.Graph()
  g.add_node(root_url)
  thread_list=[]

  for i in range(max_threads): #changed
    t=crawler_thread(next_url,g)
    t.daemon=True
    t.start()
    thread_list.append(t)

  for t in thread_list:
    t.join()

  print('Number of crawled URLs:', len(crawled_urls))
  # for url in crawled_urls:
  #   print(url)
  #   ip_list.append(socket.gethostbyname(urlparse(url).netloc))
  #   ip_set=set(ip_list)
  #   ip_list = [*ip_set, ]

  # print("Unique Host: %s " % len(ip_list))
  pagerank = nx.pagerank(g, alpha=0.85, personalization=None, weight='weight', dangling=None)
  
  result_keyvalpairs = pagerank.items()
  list_data = list(result_keyvalpairs)
  numpy_array = np.array(list_data)
  print(numpy_array, '\n')
  
  edgeNumber = g.number_of_edges()
  nodeNumber = g.number_of_nodes()
  nodesize=[g.degree(n)*10 for n in g]
  pos=nx.spring_layout(g,iterations=20)

  plt.figure(figsize=(5, 5))
  nx.draw(g, with_labels=False)
  nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
  nx.draw_networkx_edges(g, pos)
  plt.show()
  # return np.array(len(ip_list), pagerank.get(url), edgeNumber, nodeNumber)
  print()
  print('Top 10 Websites:')
  # Sort pages by PageRank score (descending) before printing the top ten
  top_pages = sorted(pagerank.items(), key=lambda kv: kv[1], reverse=True)[:10]
  for i, (site, score) in enumerate(top_pages, start=1):
    print(i, '-', site)

  # return pd.Series([pagerank.get(url), edgeNumber, nodeNumber], index=['pagerank', 'edges', 'nodes'])

url = 'https://www.geeksforgeeks.org/machine-learning/'
calculatePageRank(url)

Conclusion: Thus, in this experiment we understood the concept of web crawlers, learned
that they gather web information by indexing pages for search engines, which enables the
engines to return relevant results for user queries, and implemented a simple crawler in
Python.
