Write some Software
$10-30 USD
着払い
I need to modify the Python script below because the website has changed.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
import re
import mysql_db
# Module-level counter, incremented in main() each time db.insert_post()
# returns a falsy value for a row; main() stops once it exceeds 100.
DUPLICATES = 0
# Download the page at `url` and return its body after a 'utf-8' conversion.
# The URL is printed first (Python 2 print statement) as a progress trace.
# The request is made with a 10-second timeout.
# NOTE(review): this paste lost all indentation, and the call target plus the
# attribute chain on the return line were redacted to
# "[url removed, login to view]". Given `import requests` at the top of the
# script, this was presumably a requests call followed by an encode/decode of
# the response -- TODO restore from the original script before running.
def get_html(url):
print url
return [url removed, login to view](url, timeout=10).[url removed, login to view]('utf-8')
# Generator: walks the paginated listing (page number `count`, starting at 1)
# and yields one dict per listing block with the keys
# href, desc, cc, km, price, year, model and maker.
# Paging stops when a page contains fewer than 3 listing blocks.
# NOTE(review): indentation was stripped from this paste and several dotted
# expressions were replaced by "[url removed, login to view]"; the comments
# below describe the visible statements and hedge wherever a redacted token
# or the lost nesting has to be guessed.
def get_new_dubizzle_pages():
# Page URL template with one "{0}" slot; the real URL was redacted.
url_tpl = '[url removed, login to view]{0}'
count = 1
while 1:
# Presumably url_tpl.format(str(count)) -- TODO confirm against the original.
url = [url removed, login to view](str(count))
try:
# NOTE(review): `trying` is reset to 0 on every pass, before the download
# is attempted, so in the handler below it can only ever become 1. The
# `trying > 10` limit is therefore never reached and a URL that keeps
# failing is retried forever (count is not advanced on failure either).
trying = 0
html = get_html(url)
# NOTE(review): bare except -- also catches KeyboardInterrupt/SystemExit.
except:
trying = trying+1
if trying > 10:
break
else:
continue
# Keeps element [1] of the result, i.e. (presumably via html.split) the part
# of the page after the marker string; would raise IndexError if the marker
# is missing -- TODO confirm the redacted call.
block_html = [url removed, login to view]('fixed breakword ad_id6RSCB')[1]
# No parser argument is passed, so BeautifulSoup picks its default parser.
soup = BeautifulSoup(block_html)
count = count + 1
# One <div class="d-listing__item"> per listing.
blocks = soup.find_all('div', class_='d-listing__item')
for block in blocks:
row = {}
# Link of the first <a class="d-listing__name">, with any "?query" removed.
href = block.find_all("a", class_="d-listing__name")[0]["href"].split("?")[0]
# Site prefix was redacted; the relative href is appended to it.
row['href'] = '[url removed, login to view]' + href
# Presumably the anchor's .text stripped of newlines and surrounding whitespace.
row['desc'] = block.find_all("a",class_="d-listing__name")[0].[url removed, login to view]("\n").strip()
# All <span> elements inside the first <h2 class="d-listing__cat">.
details = block.find_all("h2", class_="d-listing__cat")[0].find_all("span")
# Arabic unit labels ("cc" and "km"), decoded from UTF-8 bytes to unicode
# (Python 2) so they can be matched against the span contents.
cc_text = "سى سى".decode("utf-8")
km_text = "كم".decode('utf-8')
#pdb.set_trace()
# Defaults used when a field cannot be extracted.
row["cc"] = ""
row["km"] = 0
row["price"] = ""
row["year"] = ""
row["model"] = ""
row["maker"] = ""
for detail in details:
# Presumably tests and cleans detail.text; the "\D" call below is presumably
# re.sub, leaving only the digits -- TODO confirm the redacted tokens.
if cc_text in [url removed, login to view]:
row['cc'] = [url removed, login to view](cc_text,"")
row['cc'] = [url removed, login to view]("\D", "", row['cc'])
if km_text in [url removed, login to view]:
# NOTE(review): passes cc_text, not km_text -- looks like a copy/paste slip.
# Likely harmless if the next line strips every non-digit, but confirm.
row['km'] = [url removed, login to view](cc_text,"")
row['km'] = [url removed, login to view]("\D", "", row['km'])
# Fall back to 0 when no digits were found for km.
if not row['km']:
row['km'] = 0
#pdb.set_trace()
# Price: serialise the first <div class="d-listing__amount">, keep the text
# before the first "<span", drop spaces, take the last line, then (presumably
# via re.sub) keep digits only. Any failure leaves price as "".
try:
row['price'] = str(block.find_all("div",class_="d-listing__amount")[0]).split("<span")[0].replace(" ","").strip("\n").split("\n")[-1]
row['price'] = [url removed, login to view]("\D", "", row['price'])
except:
pass
# Year is the text of the second span; left as "" if there is none.
try:
row['year'] = details[1].text
except:
pass
# First span, split on spaces (presumably its .text): first word is the
# maker, the remaining words joined by spaces are the model.
try:
row['model'] = " ".join(details[0].[url removed, login to view](" ")[1:])
row['maker'] = details[0].[url removed, login to view](" ")[0]
except:
pass
yield row
# Fewer than 3 listing blocks is treated as the end of the listing.
if len(blocks) < 3:
break
# Entry point: pulls rows from get_new_dubizzle_pages() and stores them via
# db.insert_post(row, 'dubizzle').
# It stops after more than 2000 rows have been seen or more than 100 inserts
# have returned a falsy value (counted in the global DUPLICATES).
# Rows without a price are skipped.
# NOTE(review): indentation was stripped from this paste, so the exact nesting
# of the last few statements cannot be confirmed from here.
def main():
global DUPLICATES
count = 0
# NOTE(review): the constructor name and host were redacted; given
# `import mysql_db` this is presumably a mysql_db connection helper -- TODO
# confirm. The user name and password are hard-coded in source: rotate them
# and load them from configuration or the environment instead.
db = [url removed, login to view](host='[url removed, login to view]',user='HHamouda',passwd='Vjiycdm3',db='HHamouda$egcars')
#db = [url removed, login to view](host='[url removed, login to view]',user='ebinp',passwd='xqU-3xY-82H-TRK',db='contact')
for page in get_new_dubizzle_pages():
count = count + 1
# Hard cap on the number of rows processed in one run.
if count > 2000:
break
# Skip listings for which no price could be extracted.
if not page['price']:
continue
# A falsy return from insert_post is counted as a duplicate.
if not db.insert_post(page,'dubizzle'):
DUPLICATES = DUPLICATES + 1
print DUPLICATES
# Presumably time.sleep(1), a one-second pause (given `import time`) -- TODO confirm.
[url removed, login to view](1)
if DUPLICATES > 100:
break
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
main()
プロジェクトID: #10049781
プロジェクトについて
5人のフリーランサーが、平均$21 で、この仕事に入札しています。
Send me a message so we can discuss your requirements; I can help you with your project. Thanks.
Do you mean that the website you are scraping has changed? Could you send the original .py script with its indentation intact? With what you pasted here, I would have to spend some time fixing it before it could work. I hop もっと