R. *_*ate 6 python web-crawler web-scraping python-3.x python-requests
我正在尝试制作一个抓取工具,针对一份欧洲航空公司列表,返回欧洲机场之间每日航班的数据。对于荷航(KLM),可以通过单击地图上的点在以下网站上找到数据(数据显示在地图下方的表格中):\n https://www.flightradar24.com/data/airlines/kl-klm/routes
\n\n我目前有以下代码:
\n\nimport requests\nimport json\nimport datetime\nimport pandas as pd\n\nmyProxy = {"http" : "http://10.120.118.49:8080", "https" : "https://10.120.118.49:8080"}\nheaders = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0"}\n\neu_airports = [\'AAL\', \'AAR\', \'ABZ\', \'ACE\', \'ADA\', \'ADB\', \'AER\', \'AES\', \'AGP\', \'AHO\', \'AJA\', \'ALA\', \'ALC\', \'AMS\',\n \'ANR\', \'AOI\', \'ARN\', \'ATH\', \'AYT\', \'BCN\', \'BDS\', \'BEG\', \'BER\', \'BES\', \'BFS\', \'BGO\', \'BGY\', \'BHD\', \n \'BHX\', \'BIA\', \'BIO\', \'BIQ\', \'BJV\', \'BLL\', \'BLQ\', \'BMA\', \'BOD\', \'BOJ\', \'BOO\', \'BRE\', \'BRI\', \'BRN\', \n \'BRQ\', \'BRS\', \'BRU\', \'BTS\', \'BUD\', \'BVA\', \'CAG\', \'CDG\', \'CFU\', \'CGN\', \'CHQ\', \'CIA\', \'CIY\', \'CLJ\', \n \'CPH\', \'CRL\', \'CTA\', \'CWL\', \'DBV\', \'DEB\', \'DLM\', \'DME\', \'DRS\', \'DTM\', \'DUB\', \'DUS\', \'EDI\', \'EGC\', \n \'EIN\', \'EMA\', \'ESB\', \'EVN\', \'FAO\', \'FCO\', \'FDH\', \'FKB\', \'FLR\', \'FMM\', \'FMO\', \'FNC\', \'FRA\', \'FSC\', \n \'FUE\', \'GDN\', \'GLA\', \'GOA\', \'GOT\', \'GRO\', \'GRQ\', \'GRZ\', \'GVA\', \'GYD\', \'HAJ\', \'HAM\', \'HAU\', \'HEL\', \n \'HER\', \'HHN\', \'HUY\', \'IAS\', \'IBZ\', \'IEV\', \'INI\', \'INN\', \'IST\', \'JER\', \'JMK\', \'JTR\', \'KBP\', \'KEF\', \n \'KGS\', \'KIR\', \'KIV\', \'KLU\', \'KRK\', \'KRS\', \'KTW\', \'KUN\', \'LBA\', \'LCA\', \'LCY\', \'LED\', \'LEI\', \'LEJ\', \n \'LGG\', \'LGW\', \'LHR\', \'LIL\', \'LIN\', \'LIS\', \'LJU\', \'LNZ\', \'LPA\', \'LPL\', \'LTN\', \'LUG\', \'LUX\', \'LYS\', \n \'MAD\', \'MAH\', \'MAN\', \'MJV\', \'MLA\', \'MMX\', \'MPL\', \'MRS\', \'MSQ\', \'MST\', \'MUC\', \'MXP\', \'NAP\', \'NCE\', \n \'NCL\', \'NOC\', \'NRN\', \'NTE\', \'NUE\', \'NYO\', \'ODS\', \'OLB\', \'OPO\', \'ORK\', \'ORY\', \'OSL\', \'OST\', \'OTP\', \n \'OUL\', \'PAD\', \'PDL\', \'PEG\', \'PFO\', \'PIK\', \'PMI\', \'PMO\', \'POZ\', \'PRG\', \'PRN\', \'PSA\', \'PSR\', \'PUY\', \n \'REU\', \'RHO\', \'RIX\', \'RTM\', 
\'RVN\', \'SAW\', \'SCQ\', \'SDR\', \'SEN\', \'SJJ\', \'SKG\', \'SKP\', \'SNN\', \'SOF\', \n \'SOU\', \'SPU\', \'STN\', \'STR\', \'SUF\', \'SVG\', \'SVO\', \'SVQ\', \'SVX\', \'SXB\', \'SXF\', \'SZG\', \'TBS\', \'TFN\', \n \'TFS\', \'TGD\', \'TIA\', \'TIV\', \'TKU\', \'TLL\', \'TLN\', \'TLS\', \'TMP\', \'TOS\', \'TPS\', \'TRD\', \'TRF\', \'TRN\', \n \'TSE\', \'TSF\', \'TSR\', \'TXL\', \'TZL\', \'TZX\', \'VAA\', \'VAR\', \'VCE\', \'VIE\', \'VKO\', \'VLC\', \'VNO\', \'VRN\', \n \'VST\', \'WAW\', \'WMI\', \'WRO\', \'XRY\', \'ZAD\', \'ZAG\', \'ZAZ\', \'ZRH\', \'ZTH\']\n\neu_countries = [\'Albania\', \'Armenia\', \'Austria\', \'Azerbaijan\', \'Belarus\', \'Belgium\', \'Bosnia And Herzegovina\', \n \'Bulgaria\', \'Croatia\', \'Cyprus\', \'Czech Republic\', \'Denmark\', \'Estonia\', \'Faroe Islands\', \'Finland\', \n \'France\', \'Georgia\', \'Germany\', \'Gibraltar\', \'Greece\', \'Hungary\', \'Iceland\', \'Ireland\', \'Italy\', \n \'Kosovo\', \'Latvia\', \'Lithuania\', \'Luxembourg\', \'Macedonia\', \'Malta\', \'Moldova\', \'Monaco\', \'Montenegro\', \n \'Netherlands\', \'Norway\', \'Poland\', \'Portugal\', \'Romania\', \'Russia\', \'Serbia\', \'Slovakia\', \'Slovenia\', \n \'Spain\', \'Sweden\', \'Switzerland\', \'Ukraine\', \'United Kingdom\']\n\n"""\neu_airlines_names = [\'Aegean Airlines\', \'Aer Lingus\', \'Aeroflot\', \'Air Baltic\', \'Air Europa\', \'Air France\', \'Alitalia\', \n \'Austrian Airlines\', \'Blue Air\', \'BRA\', \'British Airways\', \'Brussels Airlines\', \'Condor\', \'EasyJet\', \n \'Eurowings\', \'Finnair\', \'Flybe\', \'Germania\', \'HOP!\', \'Iberia\', \'Icelandair\', \'Jet2\', \'KLM\', \'LOT\', \n \'Lufthansa\', \'Norwegian\', \'Ryanair\', \'S7 Airlines\', \'SAS\', \'Swiftair\', \'Swiss\', \'TAP Portugal\', \n \'Thomas Cook Airlines\', \'Transavia\', \'Travel Service\', \'TUI fly\', \'Ukraine Int. 
Airlines\', \'Ural Airlines\', \n \'Virgin Atlantic\', \'Volotea\', \'Vueling\', \'Wideroe\', \'Wizz Air\']\n\neu_airlines_iata = [\'a3-aee\', \'ei-ein\', \'su-afl\', \'bt-bti\', \'ux-aea\', \'af-afr\', \'az-aza\', \'os-aua\', \'0b-bms\', \'tf-brx\', \n \'ba-baw\', \'sn-bel\', \'de-cfg\', \'u2-ezy\', \'ew-ewg\', \'ay-fin\', \'be-bee\', \'st-gmi\', \'a5-hop\', \'ib-ibe\', \n \'fi-ice\', \'ls-exs\', \'kl-klm\', \'lo-lot\', \'lh-dlh\', \'dy-nax\', \'fr-ryr\', \'s7-sbi\', \'sk-sas\', \'wt-swt\', \n \'lx-swr\', \'tp-tap\', \'mt-tcx\', \'hv-tra\', \'qs-tvs\', \'x3-tui\', \'ps-aui\', \'u6-svr\', \'vs-vir\', \'v7-voe\', \n \'vy-vlg\', \'wf-wif\', \'w6-wzz\']\n"""\neu_airlines_names = [\'KLM\']\neu_airlines_iata = [\'kl-klm\']\n\nfor airline in eu_airlines_iata:\n s = requests.session()\n r = s.get(\'https://www.flightradar24.com/data/airlines/\' + airline + \'/routes\', proxies = myProxy, headers = headers)\n my_json = json.loads(r.text.split(\'arrRoutes=\')[-1].split(\', arrDates=\')[0])\n iata_list = [element[item][\'iata\'] for element in my_json for item in element]\n\niata_list2 = []\niata_list1 = set(iata_list)\n\nfor i in iata_list1:\n if i not in eu_airports:\n pass\n else:\n iata_list2.append(i)\n\nprint(len(iata_list2))\n\ntoday = datetime.datetime.today()\ntomorrow1 = datetime.datetime.today() + datetime.timedelta(1)\ntomorrow2 = datetime.datetime.today() + datetime.timedelta(2)\ntomorrow3 = datetime.datetime.today() + datetime.timedelta(3)\ntomorrow4 = datetime.datetime.today() + datetime.timedelta(4)\ntomorrow5 = datetime.datetime.today() + datetime.timedelta(5)\ntomorrow6 = datetime.datetime.today() + datetime.timedelta(6)\n\ndate = datetime.datetime.strftime(today, "%Y-%m-%d")\ndate1 = datetime.datetime.strftime(tomorrow1, "%Y-%m-%d")\ndate2 = datetime.datetime.strftime(tomorrow2, "%Y-%m-%d")\ndate3 = datetime.datetime.strftime(tomorrow3, "%Y-%m-%d")\ndate4 = datetime.datetime.strftime(tomorrow4, "%Y-%m-%d")\ndate5 = datetime.datetime.strftime(tomorrow5, 
"%Y-%m-%d")\ndate6 = datetime.datetime.strftime(tomorrow6, "%Y-%m-%d")\n\ncountries = []\nairports_departure = []\nairports_arrival = []\ndailyflights = []\ndistances = []\nflights = []\naircrafts = []\nairlines = []\n\nfor airline, name in zip(eu_airlines_iata, eu_airlines_names):\n url = \'https://www.flightradar24.com/data/airlines/\' + airline + \'/routes?get-airport-arr-dep={}\'\n print(url)\n\n for abbr in iata_list2:\n try:\n cookie = r.cookies.get_dict()\n headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", "Content-Type": "application/json", "x-fetch": "true"}\n response = s.get(url.format(abbr), cookies=cookie, headers=headers, proxies = myProxy).json()\n\n for country in response[\'arrivals\']:\n if country in eu_countries:\n countries.append(country)\n daily = response[\'arrivals\'][country][\'number\'][\'flights\']\n\n if abbr not in airports_departure and abbr not in airports_arrival:\n for iata in response[\'arrivals\'][country][\'airports\']:\n if iata in eu_airports and abbr not in airports_departure:\n airports_arrival.append(iata)\n dist = response[\'arrivals\'][country][\'airports\'][iata][\'distance\']\n distances.append(int(round(dist/1000)))\n for flight in response[\'arrivals\'][country][\'airports\'][iata][\'flights\']:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date]["aircraft"]\n\n print(\'Scraping data...\')\n\n if abbr not in airports_departure:\n airports_departure.append(abbr)\n aircrafts.append(aircr)\n airlines.append(name)\n dailyflights.append(daily)\n\n else:\n pass \n\n except (IndexError, KeyError, TypeError, ValueError):\n try:\n if abbr not in airports_departure:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date2]["aircraft"]\n aircrafts.append(aircr) \n airlines.append(name)\n airports_departure.append(abbr)\n dailyflights.append(daily)\n except (IndexError, KeyError, TypeError, 
ValueError):\n try:\n if abbr not in airports_departure:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date3]["aircraft"]\n aircrafts.append(aircr) \n airlines.append(name)\n airports_departure.append(abbr)\n dailyflights.append(daily)\n except (IndexError, KeyError, TypeError, ValueError):\n try:\n if abbr not in airports_departure:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date4]["aircraft"]\n aircrafts.append(aircr) \n airlines.append(name)\n airports_departure.append(abbr)\n dailyflights.append(daily)\n except (IndexError, KeyError, TypeError, ValueError):\n try:\n if abbr not in airports_departure:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date5]["aircraft"]\n aircrafts.append(aircr) \n airlines.append(name)\n airports_departure.append(abbr)\n dailyflights.append(daily)\n except (IndexError, KeyError, TypeError, ValueError):\n try:\n if abbr not in airports_departure:\n aircr = response[\'arrivals\'][country][\'airports\'][iata]["flights"][flight]["utc"][date6]["aircraft"]\n aircrafts.append(aircr) \n airlines.append(name)\n airports_departure.append(abbr)\n dailyflights.append(daily)\n except (IndexError, KeyError, TypeError, ValueError):\n if abbr not in airports_departure:\n aircrafts.append(\'\')\n airlines.append(\'\')\n airports_departure.append(\'\')\n dailyflights.append(0)\n\n\nprint(\'Airline: \' + str(airlines))\nprint(\'Departure: \' + str(airports_departure))\nprint(\'Arrival: \' + str(airports_arrival))\nprint(\'Aircraft types: \' + str(aircrafts))\nprint(\'Distance (km): \' + str(distances))\nprint(\'Daily flights: \' + str(dailyflights))\n\nprint(\'Airline: \' + str(len(airlines)))\nprint(\'Departure: \' + str(len(airports_departure)))\nprint(\'Arrival: \' + str(len(airports_arrival)))\nprint(\'Aircrafts: \' + str(len(aircrafts)))\nprint(\'Distance: \' + str(len(distances)))\nprint(\'Daily flights: \' + 
str(len(dailyflights)))\nprint(\'Sum daily flights: \' + str(sum(dailyflights)))\n\n\ndf = pd.DataFrame({\'Airline\': airlines, \n \'Departure\': airports_departure, \n \'Arrivals\': airports_arrival, \n \'Aircraft\': aircrafts, \n \'Distance\': distances,\n \'Daily flights\': dailyflights})\nprint(df) \nRun Code Online (Sandbox Code Playgroud)\n\n这对于荷航来说效果很好,因为它的所有航班只有一个机场枢纽(史基浦机场)。然而,当我尝试抓取瑞安航空(Ryanair)等在欧洲各地拥有多个枢纽的航空公司的数据时,遇到了问题。在代码中,可以通过将列表 eu_airlines_names 和 eu_airlines_iata 中的元素从 ‘KLM’ 和 ‘kl-klm’ 更改为 ‘Ryanair’ 和 ‘fr-ryr’ 来切换到瑞安航空。
\n\n我该如何调整抓取工具来解决这个问题?另外,是否可以循环遍历列表 eu_airlines_iata 中的多个元素,而不是逐一执行?此外,目前代码只抓取到一个随机的飞机类型,有没有办法改为抓取一周内最常用的机型?
\n\n理想的输出将是单独的列表,其中包含:
\n\n对于我已放置在代码中的文档字符串中的 eu_airlines_iata 列表中的每家航空公司。
\n| 归档时间: |
|
| 查看次数: |
4990 次 |
| 最近记录: |