Har*_*son 11 python geopy flask
这个问题可能有点碰运气(希望不大),但这是我得到的错误:
File "/home/MY NAME/anaconda/lib/python2.7/SocketServer.py", line 596, in process_request_thread
self.finish_request(request, client_address)
File "/home/MY NAME/anaconda/lib/python2.7/SocketServer.py", line 331, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "/home/MY NAME/anaconda/lib/python2.7/SocketServer.py", line 654, in __init__
self.finish()
File "/home/MY NAME/anaconda/lib/python2.7/SocketServer.py", line 713, in finish
self.wfile.close()
File "/home/MY NAME/anaconda/lib/python2.7/socket.py", line 283, in close
self.flush()
File "/home/MY NAME/anaconda/lib/python2.7/socket.py", line 307, in flush
self._sock.sendall(view[write_offset:write_offset+buffer_size])
error: [Errno 32] Broken pipe
Run Code Online (Sandbox Code Playgroud)
我已经构建了一个Flask应用程序,它将地址作为输入,对其执行一些字符串格式化,处理等操作,然后将它们发送到Bing Maps进行地理编码(通过外部模块geopy).
我正在使用此应用程序来清理非常大的数据集.对于通常约1,500个地址以内的输入(每行1个地址),该应用程序可以正常工作,也就是说它会处理地址,将其发送到Bing Maps进行地理编码,然后返回结果.但在处理了大约1,500个地址之后,应用程序就变得没有响应.如果这发生在我上班的时候,公司的代理服务器会提示出现TCP错误(tcp error);如果我用的是非工作计算机,页面只是无法加载.重新启动应用程序之后,它又能完全正常工作.因此,我不得不把地址分成每批大约1,000个来运行程序(只是为了保险起见,因为我还不确定程序崩溃时的确切数量).
有谁知道可能导致它的原因是什么?
我的第一个想法是,可能是我达到了Bing API密钥的每日请求上限(即30,000次),但这说不通,因为我每天很少使用超过15,000个请求.
我的第二个想法是,也许是因为我仍在使用Flask自带的开发服务器来运行我的应用程序.改用gunicorn或uWSGI能解决这个问题吗?
我的第三个想法是,可能是请求数量过多导致的.我试着在处理完前1,000个地址之后让程序休眠15秒钟,但是没有解决任何问题.
如果有人需要进一步澄清,请告诉我.
这是我的Flask应用程序后端的代码.我从这个函数得到了输入:
@app.route("/clean", methods=['POST'])
def dothing():
    """Handle POST /clean.

    Reads the 'addresses' form field and returns whatever cleanAddress()
    produces for it as the response body.
    """
    return cleanAddress(request.form['addresses'])
Run Code Online (Sandbox Code Playgroud)
这是cleanAddress函数:它现在有点杂乱,里面有大量if语句用来检查地址中的特定拼写错误.我打算把这些代码移到另一个文件的其他函数中,然后只把地址传给那些函数,这样可以让代码整洁一些.
# Markup emitted for every address that could not be geocoded or was rejected.
_CANNOT_CLEAN_HTML = '<font color="red"> Can not be cleaned </font> <br>'

# Append-only log of "<raw input>|<cleaned output>" lines.
_ADDRESS_DATABASE = '/home/MY NAME/Address_Database.txt'

# Substrings of a geocoded street that mark the result as unusable.
_REJECTED_MARKERS = ('ncy rd', 'nye rd', 'nye c')

# Everything except letters, digits and '#' is collapsed to a single space.
_NON_ADDRESS_CHARS_RE = re.compile('[^A-Za-z0-9#]+')
# An ordinal such as '60th ' plus whatever text follows it.
_ORDINAL_RE = re.compile(
    r'(?P<number>[\d]{1,4}(th|st|nd|rd)\s)(?P<following>.*)')
_ORDINAL_NUMBER_RE = re.compile(r'(?P<number>[\d]{1,4}(st|nd|rd|th)\s)')
_STREET_WORD_RE = re.compile(r'(street|st|floor)')

# Ordered (old, new) substring fixes applied before geocoding.  ORDER MATTERS:
# later entries see the output of earlier ones, so do not sort or dedupe.
# NOTE(review): some entries look questionable ('w 31 ' -> 'w 31th ',
# ' av ' -> ' avenue' without trailing space, 'west 33' also matching
# 'west 33rd'); they are kept exactly as they were -- confirm before changing.
_PRE_GEOCODE_FIXES = (
    ('two', '2'),
    ('three', '3'),
    ('four', '4'),
    ('five', '5'),
    ('eight', '8'),
    ('nine', '9'),
    ('fith', 'fifth'),
    ('aveneu', 'avenue'),
    ('united states of america', ''),
    ('ave americas', 'avenue of the americas'),
    ('americas avenue', 'avenue of the americas'),
    ('avenue of americas', 'avenue of the americas'),
    ('avenue of america ', 'avenue of the americas '),
    ('ave of the americ', 'avenue of the americas'),
    ('avenue america', 'avenue of the americas'),
    ('americaz', 'americas'),
    ('ave of america', 'avenue of the americas'),
    ('amrica', 'americas'),
    ('americans', 'americas'),
    ('walk street', 'wall street'),
    ('northend', 'north end'),
    ('inth', 'ninth'),
    ('aprk', 'park'),
    ('eleven', '11'),
    (' av ', ' avenue'),
    ('avnue', 'avenue'),
    ('ofthe americas', 'of the americas'),
    ('aj the', 'of the'),
    ('fifht', 'fifth'),
    ('w46', 'w 46'),
    ('w42', 'w 42'),
    ('95st', '95th st'),
    ('e61 st', 'e 61st'),
    ('driver information', ''),
    ('e87', 'e 87'),
    ('thrd avenus', 'third avenue'),
    ('3r ', '3rd '),
    ('st ates', ''),
    ('east52nd', 'east 52nd'),
    ('authority to leave', ''),
    ('sreet', 'street'),
    ('w47', 'w 47'),
    ('signature required', ''),
    ('direct', ''),
    ('streetapr', 'street'),
    ('steet', 'street'),
    ('w39', 'w 39'),
    ('ave of new york', 'avenue of the americas'),
    ('avenue of new york', 'avenue of the americas'),
    ('brodway', 'broadway'),
    ('w 31 ', 'w 31th '),
    ('w 34 ', 'w 34th '),
    ('w38', 'w 38'),
    ('broadeay', 'broadway'),
    ('w37', 'w 37'),
    ('35street', '35th street'),
    ('eighth avenue', '8th avenue'),
    ('west 33', 'west 33rd'),
    ('34t ', '34th '),
    ('street ave', 'ave'),
    ('avenue of york', 'avenue of the americas'),
    ('avenue aj new york', 'avenue of the americas'),
    ('avenue ofthe new york', 'avenue of the americas'),
    ('e4', 'e 4'),
    ('avenue of nueva york', 'avenue of the americas'),
    ('avenue of new york', 'avenue of the americas'),
    ('west end new york', 'west end avenue'),
)

# Ordered fixes applied after the house number has been moved to the front.
_POST_REORDER_FIXES = (
    ('hstreet', 'h street'),
    ('dstreet', 'd street'),
    ('hst', 'h st'),
    ('dst', 'd st'),
    ('have', 'h ave'),
    ('dave', 'd ave'),
    ('havenue', 'h avenue'),
    ('davenue', 'd avenue'),
)

# Ordinal words in the geocoder's street name, applied before / after the
# special handling of Sixth Avenue.
_ORDINALS_BEFORE_SIXTH = (
    ('first', '1st'),
    ('second', '2nd'),
    ('third', '3rd'),
    ('fourth', '4th'),
    ('fifth', '5th'),
)
_ORDINALS_AFTER_SIXTH = (
    ('seventh', '7th'),
    ('fashion', '7th'),
    ('eighth', '8th'),
    ('ninth', '9th'),
    ('tenth', '10th'),
    ('eleventh', '11th'),
)


def _replace_all(text, pairs):
    """Apply each (old, new) substring replacement to *text*, in order."""
    for old, new in pairs:
        text = text.replace(old, new)
    return text


def _check_st(address):
    """Insert 'street' after a bare ordinal, e.g. '30 w 60th' style input.

    Addresses containing 'broadway', addresses without an ordinal, and
    ordinals already followed by street/st/floor are returned unchanged.
    """
    if 'broadway' in address:
        return address
    found = _ORDINAL_RE.search(address)
    if found is None or _STREET_WORD_RE.match(found.group('following')):
        return address
    return _ORDINAL_NUMBER_RE.sub(r'\g<number>street ', address, count=1)


def _normalize_address(raw, street_types, leading_re, trailing_re):
    """Turn one raw input line into the text that is sent to the geocoder.

    raw          -- one line of user input.
    street_types -- iterable of street-type words (the module's `patterns`).
    leading_re   -- compiled regex dropping a leading number group.
    trailing_re  -- compiled regex dropping text after the street type.
    """
    address = _NON_ADDRESS_CHARS_RE.sub(' ', raw.lower()).lstrip()
    address = leading_re.sub("\\1", address)
    address = _check_st(address)
    # The trigger ('one ') is deliberately narrower than what gets replaced.
    if 'one ' in address:
        address = address.replace('one', '1')
    address = _replace_all(address, _PRE_GEOCODE_FIXES)

    # If the line does not start with a number, look for a short number right
    # after a street-type word and move it to the front.
    tokens = address.split(' ')
    if not tokens[0].isdigit():
        for street_type in street_types:
            try:
                location = tokens.index(street_type) + 1
                candidate = tokens[location]
            except (ValueError, IndexError):
                continue
            if candidate.isdigit() and len(candidate) <= 4:
                tokens = ([candidate] + tokens[:location]
                          + tokens[location + 1:])
                break
    address = ' '.join(tokens).replace('#', '')

    # Drop everything in front of the first digit, if there is one.
    first_digit = next(
        (pos for pos, char in enumerate(address) if char.isdigit()), None)
    if first_digit is not None:
        address = address[first_digit:]

    address = address.replace('plz', 'plaza ', 1)
    address = _replace_all(address, _POST_REORDER_FIXES)
    return trailing_re.sub(r'\1\2', address).lstrip() + " nyc"


def _format_geocoded_street(street):
    """Lower-case the geocoder's street text and abbreviate ordinal words."""
    street = _replace_all(street.lower(), _ORDINALS_BEFORE_SIXTH)
    for sixth in (' sixth', ' 6th'):
        if sixth + ' a' in street:
            # Remove 'avenue' BEFORE 'ave': the reverse order turned
            # 'avenue' into a stray 'nue' that was never cleaned up.
            street = street.replace('avenue', '').replace('ave', '')
            street = street.replace(sixth, ' avenue of the americas')
    return _replace_all(street, _ORDINALS_AFTER_SIXTH)


def _geocode_row(query, dirty):
    """Geocode *query*; return the output row text, or None if rejected.

    Raises AttributeError when the geocoder result has no `.address`
    (presumably geocode() returned None -- see geopy docs) and ValueError
    when the address does not split into exactly four comma-separated parts.
    """
    clean = geolocator.geocode(query)
    street, _city, zipcode, _country = clean.address.split(",")
    street = _format_geocoded_street(street)
    to_write = (str(street) + ", " + str(zipcode[3:].lstrip()) + ", "
                + str(clean.latitude) + ", " + str(clean.longitude))
    to_find = str(street)
    # A street address without any number is treated as not cleaned.
    if not any(char.isdigit() for char in to_find):
        return None
    with open(_ADDRESS_DATABASE, 'a+') as database:
        # 'a+' may start positioned at end-of-file; rewind so the duplicate
        # check really sees the existing contents.
        database.seek(0)
        if to_find not in database.read():
            database.write(dirty + '|' + to_write + '\n')
    if any(marker in street for marker in _REJECTED_MARKERS):
        return None
    return to_write


def cleanAddress(addresses):
    """Clean and geocode newline-separated addresses; return an HTML report.

    addresses -- one string with one raw address per line.

    Each line is normalised, geocoded through the module-level `geolocator`,
    logged to the address database file, and rendered as
    "<old address>, <new address>" followed by an overall accuracy figure.
    Lines that cannot be geocoded are reported as "Can not be cleaned".
    Relies on the module-level names `patterns`, `geolocator` and
    `GeocoderTimedOut`.
    """
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2.7

    # Built once per call instead of once per address.
    alternatives = "|".join(patterns)
    leading_re = re.compile(r"\d+.* +(\d+ .*(" + alternatives + "))")
    trailing_re = re.compile(r'(.*)(' + alternatives + r')(.*)')

    cleaned = [
        '<body bgcolor="#FACC2E"><center><img src="http://goglobal.dhl-usa.com/common/img/dhl-express-logo.png" alt="Smiley face" height="100" width="350"><br><p>',
        '<br><h3>Note: Everything before the first comma is the Old Address. Everything after the first comma is the New Address</h3>',
        '<p><h3>To format the output in Excel, split the columns using "," as the delimiter. </p></h3>',
        '<p><h2><font color="red">Old Address </font> <font color="black">New Address </font></p></h2>',
    ]
    success = 0
    fail = 0
    for line in addresses.split('\n'):
        # Commas are removed so the output keeps exactly one comma between
        # the old and the new address.
        dirty = line.strip().replace(',', '')
        # `dirty` is raw form input: escape it before embedding it in HTML.
        cleaned.append('<font color="red">' + escape(dirty, True)
                       + ', ' + '</font>')

        query = _normalize_address(line, patterns, leading_re, trailing_re)
        print(query)
        query = query.replace('americasas st', 'americas')

        try:
            row = _geocode_row(query, dirty)
        except (AttributeError, ValueError, GeocoderTimedOut):
            row = None
        if row is None:
            cleaned.append(_CANNOT_CLEAN_HTML)
            fail += 1
        else:
            cleaned.append(row + '<br>')
            success += 1

    # split() always yields at least one line, so total is never zero.
    total = success + fail
    percent = round(float(success) / float(total) * 100, 2)
    print(percent)
    cleaned.append('<br>Accuracy: ' + str(percent) + ' %')
    cleaned.append('</p></center></body>')
    return "\n".join(cleaned)
Run Code Online (Sandbox Code Playgroud)
更新:我已经改用gunicorn运行应用程序.从家庭网络访问时问题解决了,但是通过公司的代理访问时,我仍然会收到TCP错误.控制台中没有任何错误消息,浏览器只显示TCP错误.我可以确定该工具仍在后台运行,因为循环中有一条print语句,它显示每个地址仍在被地理编码.这会不会是因为我公司的网络不喜欢页面长时间处于加载状态,所以才显示代理错误页面?
听起来好像文件句柄用完了(普通用户的默认限制为 1024)。您可以运行 grep 'open' /proc/<webapp pid>/limits 查看限制,并运行 ls -1 /proc/<pid>/fd | wc -l 查看当前打开的文件句柄数量来检查。
我认为您的代码没有发送正确的响应,这导致连接保持打开状态,最终用完打开的文件句柄(打开的套接字是 posix 系统上的文件)。
当问题出现时,可以运行 netstat -an | grep <webapp port> 来确认连接处于什么状态。它应该会列出 1000 多条 IP 和端口及其状态的记录。
我猜它们处于 TIME_WAIT 状态,这表明客户端没有正确关闭连接,要等内核稍后才回收这些连接。
尝试:
from flask import make_response
@app.route("/clean", methods=['POST'])
def dothing():
    """Handle POST /clean and return the report as an explicit 200 response.

    Reads the 'addresses' form field, passes it to cleanAddress(), and wraps
    the result with make_response() using status code 200.
    """
    report_html = cleanAddress(request.form['addresses'])
    return make_response(report_html, 200)
Run Code Online (Sandbox Code Playgroud)