Write some Software

進行中 投稿 Mar 25, 2016 着払い
進行中 着払い

I need to modify the Python script below because the website changed.

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

import requests

import time

import re

import mysql_db

# Module-level counter, incremented by main() each time db.insert_post()
# returns a falsy value; main() stops once it exceeds 100.
DUPLICATES = 0

# Print the URL being fetched, then return the page body for `url`.
# NOTE(review): this paste lost its indentation, and the site replaced
# several tokens with "[url removed, login to view]", so the function is not
# runnable as shown.
def get_html(url):

# Python 2 print statement: logs every URL requested.
print url

# The two redacted callees take (url, timeout=10) and ('utf-8') respectively;
# presumably requests.get(...) followed by .text.encode(...) -- TODO confirm
# against the original script.
return [url removed, login to view](url, timeout=10).[url removed, login to view]('utf-8')

# Generator: walks numbered listing pages (count = 1, 2, ...) and yields one
# dict per 'd-listing__item' block with the keys href, desc, cc, km, price,
# year, model and maker.
# NOTE(review): indentation was lost in this paste and several tokens were
# replaced by "[url removed, login to view]"; the nesting described in the
# comments below is inferred from statement order -- confirm against the
# original file.
def get_new_dubizzle_pages():

# URL template with a single {0} placeholder for the page number (the actual
# address is redacted).
url_tpl = '[url removed, login to view]{0}'

count = 1

while 1:

# Redacted callee; presumably url_tpl.format(...) -- TODO confirm.
url = [url removed, login to view](str(count))

try:

# NOTE(review): the retry counter is reset to 0 at the start of every
# attempt, so after a failure it is only ever 1 and the `trying > 10` check
# below can never be true; a page that keeps failing is retried forever.
trying = 0

html = get_html(url)

# NOTE(review): bare except -- catches every exception raised by get_html().
except:

trying = trying+1

if trying > 10:

break

else:

# Retry the same page: count has not been incremented yet.
continue

# Takes element [1] of the result of a redacted call whose argument is the
# marker string below; presumably html.split(marker), i.e. everything after
# the first marker -- TODO confirm.
block_html = [url removed, login to view]('fixed breakword ad_id6RSCB')[1]

soup = BeautifulSoup(block_html)

count = count + 1

# One element per listing on the page.
blocks = soup.find_all('div', class_='d-listing__item')

for block in blocks:

row = {}

# Link target of the first 'd-listing__name' anchor, query string dropped.
href = block.find_all("a", class_="d-listing__name")[0]["href"].split("?")[0]

# Prefix (redacted) is prepended to the relative href.
row['href'] = '[url removed, login to view]' + href

# Redacted attribute/method on the same anchor, called with "\n" and then
# stripped; presumably .text.strip("\n") -- TODO confirm.
row['desc'] = block.find_all("a",class_="d-listing__name")[0].[url removed, login to view]("\n").strip()

# All <span> elements inside the first 'd-listing__cat' heading.
details = block.find_all("h2", class_="d-listing__cat")[0].find_all("span")

# Arabic unit markers, decoded from UTF-8 byte strings (Python 2 idiom).
cc_text = "سى سى".decode("utf-8")

km_text = "كم".decode('utf-8')

#pdb.set_trace()

# Defaults, used when a field cannot be extracted below.
row["cc"] = ""

row["km"] = 0

row["price"] = ""

row["year"] = ""

row["model"] = ""

row["maker"] = ""

for detail in details:

# Redacted operand; presumably detail.text -- TODO confirm.
if cc_text in [url removed, login to view]:

row['cc'] = [url removed, login to view](cc_text,"")

# Pattern "\D" with an empty replacement: presumably re.sub, leaving digits
# only. NOTE(review): the pattern is not a raw string.
row['cc'] = [url removed, login to view]("\D", "", row['cc'])

if km_text in [url removed, login to view]:

# NOTE(review): this passes cc_text although the branch tests km_text --
# looks like a copy-paste slip; km_text was probably intended. The following
# line's "\D" pattern would remove the unit text anyway if it is re.sub.
row['km'] = [url removed, login to view](cc_text,"")

row['km'] = [url removed, login to view]("\D", "", row['km'])

# An empty km value falls back to 0.
if not row['km']:

row['km'] = 0

#pdb.set_trace()

try:

# From the HTML of the first 'd-listing__amount' div: keep the part before
# the first "<span", remove spaces, strip surrounding newlines, take the last
# line.
row['price'] = str(block.find_all("div",class_="d-listing__amount")[0]).split("<span")[0].replace(" ","").strip("\n").split("\n")[-1]

# Same "\D" -> "" call as above (presumably re.sub).
row['price'] = [url removed, login to view]("\D", "", row['price'])

# NOTE(review): bare except with pass -- any failure is silently ignored and
# row['price'] keeps whatever value it had at that point.
except:

pass

try:

# Year is read from the second span.
row['year'] = details[1].text

except:

pass

try:

# First span: the redacted member is called with " "; presumably
# .text.split(" "), giving maker = first word and model = remaining words --
# TODO confirm.
row['model'] = " ".join(details[0].[url removed, login to view](" ")[1:])

row['maker'] = details[0].[url removed, login to view](" ")[0]

except:

pass

yield row

# Fewer than 3 listing blocks on a page ends the pagination loop.
if len(blocks) < 3:

break

# Script entry point: iterates the rows yielded by get_new_dubizzle_pages()
# and passes each one with a non-empty price to
# db.insert_post(row, 'dubizzle').
# Stops after more than 2000 rows or once DUPLICATES exceeds 100.
# NOTE(review): indentation was lost in this paste; the nesting of the
# statements after the insert_post check is inferred -- confirm against the
# original file.
def main():

global DUPLICATES

# Number of rows pulled from the generator so far (including skipped ones).
count = 0

# SECURITY NOTE(review): database credentials are hard-coded here and in the
# commented-out line below; they should be moved out of the source and, since
# they have been published, rotated.
# The connect callee and host are redacted; presumably a helper from the
# imported mysql_db module -- TODO confirm.
db = [url removed, login to view](host='[url removed, login to view]',user='HHamouda',passwd='Vjiycdm3',db='HHamouda$egcars')

#db = [url removed, login to view](host='[url removed, login to view]',user='ebinp',passwd='xqU-3xY-82H-TRK',db='contact')

for page in get_new_dubizzle_pages():

count = count + 1

# Hard cap on rows processed per run.
if count > 2000:

break

# Rows without a price are skipped.
if not page['price']:

continue

# A falsy return from insert_post is counted in DUPLICATES; presumably it
# signals that the row already exists -- verify in mysql_db.
if not db.insert_post(page,'dubizzle'):

DUPLICATES = DUPLICATES + 1

print DUPLICATES

# Redacted callee taking 1; presumably time.sleep(1). Whether it runs for
# every row or only for duplicates cannot be determined without the original
# indentation.
[url removed, login to view](1)

# Give up once more than 100 inserts have been rejected.
if DUPLICATES > 100:

break

# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':

main()

Python ソフトウェアアーキテクチャ ウェブ記事のスクラップ

プロジェクトID: #10049781

プロジェクトについて

5個の提案 リモートプロジェクト アクティブ Mar 26, 2016

アワード:

npip99

Hello, I have plenty of experience writing programs like this so I will be able to make the edits you need quickly. Thank you, ~Nicholas Pipitone On another note, I highly suggest you remove the passwords from your p もっと

$15 USD 1日以内
(0レビュー)
0.0

5人のフリーランサーが、平均$21 で、この仕事に入札しています。

juanzapico

Send me a message so we can discuss your requirements; I can help you with your project. Thanks.

$20 USD 0日以内
(12件のレビュー)
4.2
colmenaresreina

Do you mean that the website you are scraping changed? Could you pass the original .py script that has the indentations? With what you pasted here I would have to spend some time fixing that for it to work. I hop もっと

$15 USD 1日以内
(0件のレビュー)
0.0