I am trying to implement the Amazon Web Scraper mentioned here. However, I get the output shown below, which repeats until it stops with RecursionError: maximum recursion depth exceeded. I have already tried downgrading eventlet to version 0.17.4, as mentioned here. Also, the requests module is being patched, as you can see in helpers.py.
helpers.py
import os
import random
from datetime import datetime
from urllib.parse import urlparse

import eventlet

# Green-thread-aware versions of blocking modules.
# NOTE(review): the RecursionError in eventlet's settimeout/setblocking
# appears to come from PySocks wrapping an already-green socket. Calling
# eventlet.monkey_patch() once, before anything imports socket/socks,
# instead of patching individual modules here, is the usual fix — confirm.
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')

import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

# Total number of HTTP requests issued by this process; capped at
# settings.max_requests by make_request().
num_requests = 0

redis = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port,
                          db=settings.redis_db)


def make_request(url, return_soup=True):
    """Fetch *url* and return its content.

    Returns (BeautifulSoup, html_text) when return_soup is True, the raw
    response object otherwise. Returns None for redirect URLs, failed
    requests, and non-200 responses. Raises Exception once
    settings.max_requests requests have been issued.
    """
    url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs

    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(
            settings.max_requests))

    proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException as e:
        # BUG FIX: the original logged "trying again" but neither retried
        # nor returned, then fell through to r.status_code with `r` unbound
        # (UnboundLocalError). Give up on this URL instead; callers already
        # treat None as "no page".
        log("WARNING: Request for {} failed ({}), skipping.".format(url, e))
        return None
    num_requests += 1

    if r.status_code != 200:
        os.system('say "Got non-200 Response"')
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None

    if return_soup:
        # Name the parser explicitly so behavior doesn't depend on which
        # optional parsers happen to be installed.
        return BeautifulSoup(r.text, "html.parser"), r.text
    return r


def format_url(url):
    """Absolutize *url* (default https://www.amazon.de) and strip query
    parameters not listed in settings.allowed_params."""
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.de"
    path = u.path
    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            # partition() splits on the first '=' only, so bare keys and
            # values containing '=' no longer raise ValueError.
            k, _, v = piece.partition("=")
            if k in settings.allowed_params:
                query += "{}={}&".format(k, v)
        query = query[:-1]  # drop the trailing '&' (or the bare '?')
    return "{}://{}{}{}".format(scheme, host, path, query)


def log(msg):
    """Global logging function: timestamped line to stdout when enabled."""
    if settings.log_stdout:
        try:
            print("{}: {}".format(datetime.now(), msg))
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text


def get_proxy():
    """Pick a random SOCKS5 proxy for this request, or None if none are
    configured."""
    if not settings.proxies:
        return None
    proxy = random.choice(settings.proxies)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy,
        port=settings.proxy_port,
    )
    return {"http": proxy_url, "https": proxy_url}


if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    # make_request may legitimately return None (request failure / non-200).
    if r is not None:
        print(r.text)
    else:
        print("request failed")
Output:
Traceback (most recent call last): File "helpers.py", line 112, in <module> r = make_request('https://api.ipify.org?format=json', return_soup=False) File "helpers.py", line 36, in make_request r = requests.get(url, headers=settings.headers, proxies=proxies) File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 76, in get return request('get', url, params=params, **kwargs) File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 61, in request return session.request(method=method, url=url, **kwargs) File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 530, in request resp = self.send(prep, **send_kwargs) File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 643, in send r = adapter.send(request, **kwargs) File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/adapters.py", line 449, in send timeout=timeout File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen chunked=chunked, File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 376, in _make_request self._validate_conn(conn) File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 994, in _validate_conn conn.connect() File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connection.py", line 300, in connect conn = self._new_conn() File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/contrib/socks.py", line 99, in _new_conn **extra_kw File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 199, in create_connection sock.connect((remote_host, remote_port)) File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 47, in wrapper return function(*args, **kwargs) File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 774, in connect super(socksocket, self).settimeout(self._timeout) File 
"/home/ec2-user/env/lib64/python3.7/site-packages/eventlet/greenio/base.py", line 395, in settimeout self.setblocking(True)
What might be the problem here?