
from __future__ import absolute_import, unicode_literals

import locale
import traceback

import requests
from bs4 import BeautifulSoup as bs
from celery import shared_task

from .models import Ad, AdUrl, Image
# Extraction helpers (getStatus, getPrice, getCurrency, getName, ...) come
# from this wildcard import.
from .data_extractor import *
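
# The models live in .models and are not shown in this file. From the way
# they are used below, they presumably look roughly like this (a hedged
# sketch, not the actual definitions):
#
#   class Ad(models.Model):
#       SOLD = ...                        # one of the status values
#       name = models.CharField(...)
#       price = ...                       # numeric price
#       currency = models.CharField(...)
#       status = models.CharField(...)
#       startDate = ...                   # date the ad was posted
#       endDate = ...                     # set once the ad is closed/sold
#       views = models.IntegerField(...)
#       description = models.TextField(...)
#
#   class AdUrl(models.Model):
#       url = models.URLField(unique=True)
#       ad = models.ForeignKey(Ad, null=True, ...)  # filled in by parseDetail
#
#   class Image(models.Model):
#       url = models.URLField()
#       ad = models.ForeignKey(Ad, ...)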

def parseDetail(adUrl):
    try:
        page = requests.get(adUrl.url)
        soup = bs(page.text, "lxml")

        if adUrl.ad is None:
            # First visit: scrape the full detail page and create the Ad.
            status = getStatus(soup)
            priceString = soup.find(attrs={"class": "xxxx-large"},
                                    name="strong").text.replace(" ", "")
            price = getPrice(priceString)
            currency = getCurrency(priceString)
            characteristics = getCharacteristics(soup)  # extracted but not stored yet

            ad = Ad(name=getName(soup),
                    price=price,
                    currency=currency,
                    status=status,
                    startDate=getStartDate(soup),
                    endDate=getEndDate(status),
                    views=getViews(soup),
                    description=getDescription(soup))
            ad.save()
            # Save the ad's images.
            for url in getImageUrls(soup):
                Image(url=url, ad=ad).save()
            adUrl.ad = ad
            adUrl.save()

        elif adUrl.ad.status != Ad.SOLD:
            # The ad already exists and is still active: refresh its status,
            # end date and view count.
            status = getStatus(soup)
            adUrl.ad.status = status
            adUrl.ad.endDate = getEndDate(status)
            adUrl.ad.views = getViews(soup)
            adUrl.ad.save()  # save the Ad itself; adUrl was not modified here

    except Exception as e:
        traceback.print_tb(e.__traceback__)
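
# The helpers imported from .data_extractor are not shown here. Judging only
# by how parseDetail calls them, their signatures are presumably along these
# lines (a sketch, not the project's actual code):
#
#   getStatus(soup)           -> ad status (e.g. active / Ad.SOLD)
#   getPrice(priceString)     -> numeric price parsed from the price text
#   getCurrency(priceString)  -> currency part of the price text
#   getCharacteristics(soup)  -> ad attributes scraped from the detail page
#   getName(soup)             -> ad title
#   getStartDate(soup)        -> date the ad was posted
#   getEndDate(status)        -> end date derived from the status
#   getViews(soup)            -> view counter
#   getDescription(soup)      -> ad description text
#   getImageUrls(soup)        -> list of image URLs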

def getTotalNumberOfPages(url="https://www.olx.ro/imobiliare"):
    # The last pager element with this class links to the final results page.
    soup = bs(requests.get(url).text, "lxml")
    nr_of_pages = soup.findAll(
        attrs={"class": "block br3 brc8 large tdnone lheight24"})[-1]
    return int(nr_of_pages.contents[1].text)

@shared_task
def scraper(rentingURL):
    pages = getTotalNumberOfPages(rentingURL)
    for pageNum in range(1, pages + 1):
        print("Page %d." % pageNum)
        mainPage = requests.get(rentingURL + "?page=" + str(pageNum))
        soup = bs(mainPage.text, "lxml")

        # Every result on the listing page links to an ad detail page.
        links = soup.findAll(
            attrs={"class": "marginright5 link linkWithHash detailsLink"})

        for a in links:
            # get_or_create persists new rows itself, so no extra save().
            obj, created = AdUrl.objects.get_or_create(url=a['href'])
            # print("\tAd:", parseDetail(a['href']))
            print("\tAd added:", a['href'])

@shared_task
def populate_ads_details():
    adUrls = AdUrl.objects.all()
    # Romanian locale, presumably so localized text (e.g. month names in the
    # ads' dates) parses correctly inside the data_extractor helpers.
    locale.setlocale(locale.LC_ALL, 'ro_RO.UTF-8')
    for adUrl in adUrls:
        parseDetail(adUrl)
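
# Both tasks are shared_tasks, so they can be wired into a periodic schedule.
# A minimal Celery beat sketch (this would live in the Celery app module, not
# in tasks.py; the module path "ads.tasks", the listing URL and the times are
# assumptions, not taken from this project):
#
#   from celery.schedules import crontab
#
#   app.conf.beat_schedule = {
#       'scrape-ad-urls': {
#           'task': 'ads.tasks.scraper',
#           'schedule': crontab(hour=3, minute=0),
#           'args': ("https://www.olx.ro/imobiliare",),
#       },
#       'populate-ad-details': {
#           'task': 'ads.tasks.populate_ads_details',
#           'schedule': crontab(hour=4, minute=0),
#       },
#   }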
