From 4aaa5b4ffa5e686d214dcaf65e986733c9179548 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 27 Sep 2020 13:19:50 -0500 Subject: [PATCH 1/3] label rows with carrier and pivot contents --- __init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/__init__.py b/__init__.py index e74a99a..a9001e3 100644 --- a/__init__.py +++ b/__init__.py @@ -22,7 +22,7 @@ def scrape_carrier(carrier_id): vehicle_type_rows = [item.find_parent('tr') for item in vehicle_type_soups] vehicle_type_table = [[item.text for item in row.find_all()] for row in vehicle_type_rows] - return (included_cargo, vehicle_type_table) + return (included_cargo, vehicle_type_table, carrier_id) def parse_carrier_ids(fp): """parse carriers ids to put into scraper""" @@ -36,11 +36,14 @@ def parse_carrier_ids(fp): def write_carrier_results(results): """writes carrier information into joinable csvs""" + cargos, vehicles, carrier_id = results with open('data/carrier.csv', 'a') as carrier_file: - csv.writer(carrier_file).writerow(results[0]) + for cargo in cargos: + csv.writer(carrier_file).writerow([carrier_id, cargo]) with open('data/carrier_vehicle.csv', 'a') as carrier_vehicle_file: - csv.writer(carrier_vehicle_file).writerow(results[1]) + for vehicle_type in vehicles: + csv.writer(carrier_vehicle_file).writerow([carrier_id, vehicle_type]) def main(fp): """scrape all carriers""" From c2f19f51c65ddc940e543200322966ba0f7e4ee2 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 27 Sep 2020 13:39:13 -0500 Subject: [PATCH 2/3] write into new timestamped directory --- __init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/__init__.py b/__init__.py index a9001e3..852e75b 100644 --- a/__init__.py +++ b/__init__.py @@ -1,6 +1,8 @@ from bs4 import BeautifulSoup import requests import csv +from datetime import datetime +import os def scrape_carrier(carrier_id): """"scrape the page of one carrier""" @@ -34,23 +36,26 @@ def parse_carrier_ids(fp): ids = [row[CARRIER_ID_COLUMN_INDEX] for row in reader] return ids -def write_carrier_results(results): +def write_carrier_results(results, directory): """writes carrier information into joinable csvs""" cargos, vehicles, carrier_id = results - with open('data/carrier.csv', 'a') as carrier_file: + with open(os.path.join(directory, 'carrier.csv'), 'a') as carrier_file: for cargo in cargos: csv.writer(carrier_file).writerow([carrier_id, cargo]) - with open('data/carrier_vehicle.csv', 'a') as carrier_vehicle_file: + with open(os.path.join(directory, 'carrier_vehicle.csv'), 'a') as carrier_vehicle_file: for vehicle_type in vehicles: csv.writer(carrier_vehicle_file).writerow([carrier_id, vehicle_type]) def main(fp): """scrape all carriers""" ids = parse_carrier_ids(fp) + runtime = datetime.now().isoformat() + directory = os.path.join('data', runtime) + os.mkdir(directory) for carrier_id in ids: results = scrape_carrier(carrier_id) - write_carrier_results(results) + write_carrier_results(results, directory) if __name__ == "__main__": main('FMCSA_CENSUS1_2020Aug/FMCSA_CENSUS1_2020Aug.txt') From 630dc7bea63dc39517beadceff0f5f039fbb5c58 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 27 Sep 2020 14:06:10 -0500 Subject: [PATCH 3/3] handle multiple columns per vehicle type --- __init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/__init__.py b/__init__.py index 852e75b..6738684 100644 --- a/__init__.py +++ b/__init__.py @@ -45,7 +45,7 @@ def write_carrier_results(results, directory): with open(os.path.join(directory, 'carrier_vehicle.csv'), 'a') as carrier_vehicle_file: for vehicle_type in vehicles: - csv.writer(carrier_vehicle_file).writerow([carrier_id, vehicle_type]) + csv.writer(carrier_vehicle_file).writerow([carrier_id] + vehicle_type) def main(fp): """scrape all carriers"""