# Reconstructed from a whitespace-collapsed git diff that made two changes:
#   * .gitignore: appended a `key.sh` entry below the `.vscode` entry
#   * bulk/mapboxMapping.py: added as a new file (the module below)
"""Bulk-geocode building addresses with Mapbox and store the results.

The script runs three phases, timing each one:

1. ``getData``    - SELECT addresses from Postgres into ``selectData.csv``.
2. ``mapboxData`` - geocode every CSV row through the Mapbox API (in a
   process pool) and append the hits to ``mapboxData.csv``.
3. ``putData``    - INSERT the geocoded rows into ``table_for_testing``.

Configuration is read from the environment variables ``DATABASE``, ``USER``,
``DB_PASS``, ``HOST``, ``PORT`` (database) and ``TOKEN`` (Mapbox).
"""
import psycopg2
import requests
import json
import time
from multiprocessing import Process, Pool
import csv
import os
from contextlib import closing
from urllib.parse import quote

# Intermediate files shared between the three phases.
_SELECT_CSV = 'selectData.csv'
_MAPBOX_CSV = 'mapboxData.csv'

# Seconds to wait on Mapbox before giving up on a single address; without a
# timeout one stalled request would block its pool worker forever.
_REQUEST_TIMEOUT = 30


def _connect(db_credentials):
    """Open a psycopg2 connection from ``[database, user, password, host, port]``."""
    connection = psycopg2.connect(
        database=db_credentials[0],
        user=db_credentials[1],
        password=db_credentials[2],
        host=db_credentials[3],
        port=db_credentials[4],
    )
    print('DB Connection establised successfully!')
    return connection


def getData(db_credentials):
    """Export ``(address, building_id)`` rows from Postgres to ``selectData.csv``.

    The address is the house number, street name and zipcode joined by single
    spaces. A NULL ``building_id`` is written as ``0``.

    Args:
        db_credentials: sequence ``[database, user, password, host, port]``.
    """
    query = (
        'SELECT building_id, house_number, street_name, zipcode FROM address '
        'INNER JOIN building_address ON address.id = building_address.address_id '
        'ORDER BY building_id ASC limit 120;'
    )

    # closing() guarantees cursor and connection are released even when the
    # query raises.
    with closing(_connect(db_credentials)) as connection, \
            closing(connection.cursor()) as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
        connection.commit()

    # NOTE(review): append mode means re-running the script duplicates rows in
    # this file (and therefore the geocoding and INSERT work) - confirm whether
    # accumulating across runs is intended.
    # newline='' is what the csv module requires so it controls line endings.
    with open(_SELECT_CSV, 'a', newline='') as f:
        writer = csv.writer(f)
        for building_id, house_number, street_name, zipcode in rows:
            address = ' '.join(
                str(part) for part in (house_number, street_name, zipcode)
            )
            writer.writerow([address, 0 if building_id is None else building_id])


def mapboxData(row):
    """Geocode one ``[address, building_id]`` row and append the hit to ``mapboxData.csv``.

    Writes ``[address, building_id, place_name]`` when Mapbox returns at least
    one feature. Rows with no usable result, and rows whose request fails, are
    skipped so that one bad address does not abort the whole batch.

    Args:
        row: sequence whose first item is the address text and second item is
            the building id.
    """
    # Percent-encode the address: it becomes a URL path segment, so characters
    # such as '/', '?', '#' or ';' would otherwise change the request.
    url = "https://api.mapbox.com/geocoding/v5/mapbox.places/{}.json".format(
        quote(row[0], safe='')
    )
    params = {
        'access_token': os.environ.get('TOKEN'),
        'limit': 1,
        'region': 'New York',
    }

    try:
        response = requests.get(
            url=url, params=params, timeout=_REQUEST_TIMEOUT
        ).json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older requests versions.
        print('Mapbox request failed for {!r}: {}'.format(row[0], exc))
        return

    try:
        place_name = response['features'][0]['place_name']
    except (KeyError, IndexError, TypeError):
        # No features (or an error payload): nothing to record for this row.
        return

    with open(_MAPBOX_CSV, 'a', newline='') as f:
        csv.writer(f).writerow([row[0], row[1], place_name])


def putData(db_credentials):
    """Insert every row of ``mapboxData.csv`` into ``table_for_testing``.

    Column 3 of the CSV (the Mapbox place name) goes to ``mapbox_address`` and
    column 2 goes to ``building_id``. All rows are committed in a single
    transaction.

    Args:
        db_credentials: sequence ``[database, user, password, host, port]``.
    """
    # Bound parameters let the driver do the quoting, so place names containing
    # apostrophes are stored intact and cannot alter the statement.
    insert = (
        'INSERT INTO table_for_testing (mapbox_address, building_id) '
        'VALUES (%s, %s);'
    )

    with closing(_connect(db_credentials)) as connection, \
            closing(connection.cursor()) as cursor:
        with open(_MAPBOX_CSV, 'r', newline='') as f:
            for row in csv.reader(f):
                if len(row) < 3:
                    # Blank or truncated line - nothing to insert.
                    continue
                cursor.execute(insert, (row[2], row[1]))
        connection.commit()


def main():
    """Run the select, geocode and insert phases, printing the time of each."""
    # NOTE(review): USER is normally the OS login name; confirm that is the
    # intended database user.
    db_credentials = [
        os.environ.get(name)
        for name in ('DATABASE', 'USER', 'DB_PASS', 'HOST', 'PORT')
    ]

    # Reference point for the TOTAL TIME report at the end.
    start_time = time.time()

    start_time1 = time.time()
    getData(db_credentials)
    end_time1 = time.time()
    print('SELECT TIME: ', end_time1 - start_time1)

    batch = 25

    start_time2 = time.time()
    with open(_SELECT_CSV, 'r', newline='') as f:
        data = list(csv.reader(f))

    # The context manager shuts the workers down; pool.map blocks until each
    # batch is finished, so no work is still pending on exit.
    with Pool(processes=13) as pool:
        for n, start in enumerate(range(0, len(data), batch)):
            print('Batch: ', n)
            pool.map(mapboxData, data[start:start + batch])

    end_time2 = time.time()
    print('MAPBOX TIME: ', end_time2 - start_time2)

    start_time3 = time.time()
    putData(db_credentials)
    end_time3 = time.time()
    print('INSERT TIME: ', end_time3 - start_time3)

    end_time = time.time()
    print('TOTAL TIME: ', end_time - start_time)


if __name__ == "__main__":
    main()