From 744ed51ed500df23506ce34f8d565fb5acfa1974 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Thu, 15 Dec 2022 09:45:49 -0500 Subject: [PATCH 01/10] Loader will not fail on malformed records, instead will count and skip them --- pygtfs/loader.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pygtfs/loader.py b/pygtfs/loader.py index 9905ff2..ccaee57 100644 --- a/pygtfs/loader.py +++ b/pygtfs/loader.py @@ -77,7 +77,8 @@ def append_feed(schedule, feed_filename, strip_fields=True, continue gtfs_table = gtfs_tables[gtfs_class] - + skipped_records = 0 + read_records = 0 for i, record in enumerate(gtfs_table): if not record: # Empty row. @@ -85,16 +86,18 @@ def append_feed(schedule, feed_filename, strip_fields=True, try: instance = gtfs_class(feed_id=feed_id, **record._asdict()) + schedule.session.add(instance) + read_records += 1 except: - print("Failure while writing {0}".format(record)) - raise - schedule.session.add(instance) + skipped_records += 1 + print(f"Failure while writing {record}") + # raise if i % chunk_size == 0 and i > 0: schedule.session.flush() sys.stdout.write('.') sys.stdout.flush() - print('%d record%s read for %s.' % ((i+1), '' if i == 0 else 's', - gtfs_class)) + print(f'{read_records} records read for {gtfs_class}') + print(f'{skipped_records} records skipped for {gtfs_class}') schedule.session.flush() schedule.session.commit() # load many to many relationships From 5fbbd847cedc095ffad9bb36c86161844b998686 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Thu, 15 Dec 2022 10:18:13 -0500 Subject: [PATCH 02/10] Added option to fail append_feed if bad records are found --- pygtfs/loader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pygtfs/loader.py b/pygtfs/loader.py index ccaee57..31508e1 100644 --- a/pygtfs/loader.py +++ b/pygtfs/loader.py @@ -1,17 +1,16 @@ from __future__ import (division, absolute_import, print_function, unicode_literals) -from datetime import date import sys +from datetime import date import six -from sqlalchemy import and_ -from sqlalchemy.sql.expression import select, join -from .gtfs_entities import (Feed, Service, ServiceException, gtfs_required, +from . import feed +from .exceptions import PygtfsException +from .gtfs_entities import (Feed, gtfs_required, Translation, Stop, Trip, ShapePoint, _stop_translations, _trip_shapes, gtfs_calendar, gtfs_all) -from . import feed def list_feeds(schedule): @@ -46,7 +45,7 @@ def overwrite_feed(schedule, feed_filename, *args, **kwargs): def append_feed(schedule, feed_filename, strip_fields=True, - chunk_size=5000, agency_id_override=None): + chunk_size=5000, agency_id_override=None, ignore_failures=True): fd = feed.Feed(feed_filename, strip_fields) @@ -91,7 +90,8 @@ def append_feed(schedule, feed_filename, strip_fields=True, except: skipped_records += 1 print(f"Failure while writing {record}") - # raise + if not ignore_failures: + raise if i % chunk_size == 0 and i > 0: schedule.session.flush() sys.stdout.write('.') From 1baa416bcd73021ab520f7c342f5c5e082efe2a4 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Thu, 15 Dec 2022 13:57:08 -0500 Subject: [PATCH 03/10] Specified that stop_lat and stop_long are allowed to be empty --- pygtfs/gtfs_entities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index 8244006..cdb2d24 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -145,8 +145,8 @@ class Stop(Base): stop_code = Column(Unicode, nullable=True, index=True) stop_name = Column(Unicode) stop_desc = Column(Unicode, nullable=True) - stop_lat = Column(Float) - stop_lon = Column(Float) + stop_lat = Column(Float, nullable=True) + stop_lon = Column(Float, nullable=True) zone_id = Column(Unicode, nullable=True) stop_url = Column(Unicode, nullable=True) location_type = Column(Integer, nullable=True) From e4acef5a99dbcf48acf551bc23589532b24a1d5e Mon Sep 17 00:00:00 2001 From: James Budarz Date: Thu, 15 Dec 2022 15:14:09 -0500 Subject: [PATCH 04/10] Allowing transfer_type up to 5 and allowing None for stop_lat and stop_long --- pygtfs/gtfs_entities.py | 4 +++- pygtfs/loader.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index cdb2d24..7edac89 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -70,6 +70,8 @@ def in_range(self, key, value): def _validate_float_range(float_min, float_max, *field_names): @validates(*field_names) def in_range(self, key, value): + if value is None: + return value float_value = float(value) if not (float_min <= float_value <= float_max): raise PygtfsValidationError( @@ -507,7 +509,7 @@ class Transfer(Base): primaryjoin=and_(Trip.trip_id == foreign(to_trip_id), Trip.feed_id == feed_id)) - _validate_transfer_type = _validate_int_choice([None, 0, 1, 2, 3], + _validate_transfer_type = _validate_int_choice([None, 0, 1, 2, 3, 4, 5], 'transfer_type') def __repr__(self): diff --git a/pygtfs/loader.py b/pygtfs/loader.py index 31508e1..7fad653 100644 --- a/pygtfs/loader.py +++ b/pygtfs/loader.py @@ -20,7 +20,6 @@ def list_feeds(schedule): def delete_feed(schedule, feed_filename, interactive=False): - feed_name = feed.derive_feed_name(feed_filename) feeds_with_name = schedule.session.query(Feed).filter(Feed.feed_name == feed_name).all() delete_all = not interactive @@ -46,7 +45,6 @@ def overwrite_feed(schedule, feed_filename, *args, **kwargs): def append_feed(schedule, feed_filename, strip_fields=True, chunk_size=5000, agency_id_override=None, ignore_failures=True): - fd = feed.Feed(feed_filename, strip_fields) gtfs_tables = {} From dab51b0d69846bda13fd26ddacc459b8d6554cdc Mon Sep 17 00:00:00 2001 From: James Budarz Date: Thu, 15 Dec 2022 15:42:20 -0500 Subject: [PATCH 05/10] Changed return statement for conformity --- pygtfs/gtfs_entities.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index 7edac89..f8e2a63 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -71,7 +71,7 @@ def _validate_float_range(float_min, float_max, *field_names): @validates(*field_names) def in_range(self, key, value): if value is None: - return value + return None float_value = float(value) if not (float_min <= float_value <= float_max): raise PygtfsValidationError( @@ -330,7 +330,6 @@ class Trip(Base): primaryjoin=and_(foreign(service_id) == Service.service_id, feed_id == Service.feed_id)) - _validate_direction_id = _validate_int_choice([None, 0, 1], 'direction_id') _validate_wheelchair = _validate_int_choice([None, 0, 1, 2], 'wheelchair_accessible') @@ -545,7 +544,8 @@ def __repr__(self): Column('trans_id', Unicode), Column('lang', Unicode), ForeignKeyConstraint(['stop_feed_id', 'stop_id'], [Stop.feed_id, Stop.stop_id]), - ForeignKeyConstraint(['translation_feed_id', 'trans_id', 'lang'], [Translation.feed_id, Translation.trans_id, Translation.lang]), + ForeignKeyConstraint(['translation_feed_id', 'trans_id', 'lang'], + [Translation.feed_id, Translation.trans_id, Translation.lang]), ) From 8e486e672ca540189e1b482b3a99b2090b49f16b Mon Sep 17 00:00:00 2001 From: James Budarz Date: Tue, 20 Dec 2022 17:18:31 -0500 Subject: [PATCH 06/10] Changed f-strings for Python 2.7 compatibility --- pygtfs/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygtfs/loader.py b/pygtfs/loader.py index 7fad653..d3e23be 100644 --- a/pygtfs/loader.py +++ b/pygtfs/loader.py @@ -87,15 +87,15 @@ def append_feed(schedule, feed_filename, strip_fields=True, read_records += 1 except: skipped_records += 1 - print(f"Failure while writing {record}") + print("Failure while writing {}".format(record)) if not ignore_failures: raise if i % chunk_size == 0 and i > 0: schedule.session.flush() sys.stdout.write('.') sys.stdout.flush() - print(f'{read_records} records read for {gtfs_class}') - print(f'{skipped_records} records skipped for {gtfs_class}') + print('{0} records read for {1}'.format(read_records, gtfs_class)) + print('{0} records skipped for {1}'.format(skipped_records, gtfs_class)) schedule.session.flush() schedule.session.commit() # load many to many relationships From c18a4239610c9374ef10e99d9f95467d17357259 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Fri, 6 Jan 2023 14:21:56 -0500 Subject: [PATCH 07/10] Made default value for ignoring corrupt lines False --- pygtfs/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygtfs/loader.py b/pygtfs/loader.py index d3e23be..1972bc8 100644 --- a/pygtfs/loader.py +++ b/pygtfs/loader.py @@ -44,7 +44,7 @@ def overwrite_feed(schedule, feed_filename, *args, **kwargs): def append_feed(schedule, feed_filename, strip_fields=True, - chunk_size=5000, agency_id_override=None, ignore_failures=True): + chunk_size=5000, agency_id_override=None, ignore_failures=False): fd = feed.Feed(feed_filename, strip_fields) gtfs_tables = {} From 6b358a7bf3a54bc2f2b68cddc6b264de75190a18 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Fri, 6 Jan 2023 14:33:44 -0500 Subject: [PATCH 08/10] Added _validate_nullable_float_range function for lat and long validation --- pygtfs/gtfs_entities.py | 57 ++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index f8e2a63..22fbba6 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -26,6 +26,7 @@ def _validate_date(*field_names): @validates(*field_names) def make_date(self, key, value): return datetime.datetime.strptime(value, '%Y%m%d').date() + return make_date @@ -37,6 +38,7 @@ def time_delta(self, key, value): (hours, minutes, seconds) = map(int, value.split(":")) return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) + return time_delta @@ -47,6 +49,7 @@ def int_bool(self, key, value): raise PygtfsValidationError("{0} must be 0 or 1, " "was {1}".format(key, value)) return value == "1" + return int_bool @@ -64,10 +67,24 @@ def in_range(self, key, value): raise PygtfsValidationError( "{0} must be in range {1}, was {2}".format(key, int_choice, value)) return int_value + return in_range def _validate_float_range(float_min, float_max, *field_names): + @validates(*field_names) + def in_range(self, key, value): + float_value = float(value) + if not (float_min <= float_value <= float_max): + raise PygtfsValidationError( + "{0} must be in range [{1}, {2}]," + " was {2}".format(key, float_min, float_max, value)) + return float_value + + return in_range + + +def _validate_nullable_float_range(float_min, float_max, *field_names): @validates(*field_names) def in_range(self, key, value): if value is None: @@ -78,6 +95,7 @@ def in_range(self, key, value): "{0} must be in range [{1}, {2}]," " was {2}".format(key, float_min, float_max, value)) return float_value + return in_range @@ -87,6 +105,7 @@ def is_float_none(self, key, value): if value is None or value == "": return None return float(value) + return is_float_none @@ -164,8 +183,8 @@ class Stop(Base): _validate_location = _validate_int_choice([None, 0, 1, 2, 3, 4], 'location_type') _validate_wheelchair = _validate_int_choice([None, 0, 1, 2], 'wheelchair_boarding') - _validate_lon_lat = _validate_float_range(-180, 180, 'stop_lon', - 'stop_lat') + _validate_lon_lat = _validate_nullable_float_range(-180, 180, 'stop_lon', + 'stop_lat') def __repr__(self): return '' % (self.stop_id, self.stop_name) @@ -191,8 +210,8 @@ class Route(Base): ) agency = relationship(Agency, backref="routes", - primaryjoin=and_(Agency.agency_id==foreign(agency_id), - Agency.feed_id==feed_id)) + primaryjoin=and_(Agency.agency_id == foreign(agency_id), + Agency.feed_id == feed_id)) # https://developers.google.com/transit/gtfs/reference/extended-route-types valid_extended_route_types = [ @@ -318,17 +337,17 @@ class Trip(Base): ) route = relationship(Route, backref="trips", - primaryjoin=and_(Route.route_id==foreign(route_id), - Route.feed_id==feed_id)) + primaryjoin=and_(Route.route_id == foreign(route_id), + Route.feed_id == feed_id)) shape_points = relationship(ShapePoint, backref="trips", - secondary="_trip_shapes") + secondary="_trip_shapes") # TODO: The service_id references to calendar or to calendar_dates. # Need to implement this requirement, but not using a simple foreign key. service = relationship(Service, backref='trips', - primaryjoin=and_(foreign(service_id) == Service.service_id, - feed_id == Service.feed_id)) + primaryjoin=and_(foreign(service_id) == Service.service_id, + feed_id == Service.feed_id)) _validate_direction_id = _validate_int_choice([None, 0, 1], 'direction_id') _validate_wheelchair = _validate_int_choice([None, 0, 1, 2], @@ -375,11 +394,11 @@ class StopTime(Base): ) stop = relationship(Stop, backref='stop_times', - primaryjoin=and_(Stop.stop_id==foreign(stop_id), - Stop.feed_id==feed_id)) + primaryjoin=and_(Stop.stop_id == foreign(stop_id), + Stop.feed_id == feed_id)) trip = relationship(Trip, backref="stop_times", - primaryjoin=and_(Trip.trip_id==foreign(trip_id), - Trip.feed_id==feed_id)) + primaryjoin=and_(Trip.trip_id == foreign(trip_id), + Trip.feed_id == feed_id)) _validate_pickup_drop_off = _validate_int_choice([None, 0, 1, 2, 3], 'pickup_type', @@ -433,8 +452,8 @@ class FareRule(Base): ) route = relationship(Route, backref="fare_rules", - primaryjoin=and_(Route.route_id==foreign(route_id), - Route.feed_id==feed_id)) + primaryjoin=and_(Route.route_id == foreign(route_id), + Route.feed_id == feed_id)) def __repr__(self): return '' % (self.fare_id, @@ -459,8 +478,8 @@ class Frequency(Base): ) trip = relationship(Trip, backref="frequencies", - primaryjoin=and_(Trip.trip_id==foreign(trip_id), - Trip.feed_id==feed_id)) + primaryjoin=and_(Trip.trip_id == foreign(trip_id), + Trip.feed_id == feed_id)) _validate_exact_times = _validate_int_choice([None, 0, 1], 'exact_times') _validate_deltas = _validate_time_delta('start_time', 'end_time') @@ -548,7 +567,6 @@ def __repr__(self): [Translation.feed_id, Translation.trans_id, Translation.lang]), ) - _trip_shapes = Table( '_trip_shapes', Base.metadata, Column('trip_feed_id', Integer), @@ -558,10 +576,9 @@ def __repr__(self): Column('shape_pt_sequence', Integer), ForeignKeyConstraint(['trip_feed_id', 'trip_id'], [Trip.feed_id, Trip.trip_id]), ForeignKeyConstraint(['shape_feed_id', 'shape_id', 'shape_pt_sequence'], - [ShapePoint.feed_id, ShapePoint.shape_id, ShapePoint.shape_pt_sequence]), + [ShapePoint.feed_id, ShapePoint.shape_id, ShapePoint.shape_pt_sequence]), ) - # a feed can skip Service (calendar) if it has ServiceException(calendar_dates) gtfs_required = {Agency, Stop, Route, Trip, StopTime} gtfs_calendar = {Service, ServiceException} From a4c679d2ed2e9dba91d11088e320dd6cc8374ef1 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Fri, 6 Jan 2023 14:38:14 -0500 Subject: [PATCH 09/10] Simplified nullable float check to a single function --- pygtfs/gtfs_entities.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index 22fbba6..6c186d5 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -71,23 +71,10 @@ def in_range(self, key, value): return in_range -def _validate_float_range(float_min, float_max, *field_names): +def _validate_float_range(float_min, float_max, nullable, *field_names): @validates(*field_names) def in_range(self, key, value): - float_value = float(value) - if not (float_min <= float_value <= float_max): - raise PygtfsValidationError( - "{0} must be in range [{1}, {2}]," - " was {2}".format(key, float_min, float_max, value)) - return float_value - - return in_range - - -def _validate_nullable_float_range(float_min, float_max, *field_names): - @validates(*field_names) - def in_range(self, key, value): - if value is None: + if nullable and value is None: return None float_value = float(value) if not (float_min <= float_value <= float_max): @@ -166,8 +153,9 @@ class Stop(Base): stop_code = Column(Unicode, nullable=True, index=True) stop_name = Column(Unicode) stop_desc = Column(Unicode, nullable=True) - stop_lat = Column(Float, nullable=True) - stop_lon = Column(Float, nullable=True) + nullable_lat_long = True + stop_lat = Column(Float, nullable=nullable_lat_long) + stop_lon = Column(Float, nullable=nullable_lat_long) zone_id = Column(Unicode, nullable=True) stop_url = Column(Unicode, nullable=True) location_type = Column(Integer, nullable=True) @@ -183,8 +171,8 @@ class Stop(Base): _validate_location = _validate_int_choice([None, 0, 1, 2, 3, 4], 'location_type') _validate_wheelchair = _validate_int_choice([None, 0, 1, 2], 'wheelchair_boarding') - _validate_lon_lat = _validate_nullable_float_range(-180, 180, 'stop_lon', - 'stop_lat') + _validate_lon_lat = _validate_float_range(-180, 180, nullable_lat_long, + 'stop_lon', 'stop_lat') def __repr__(self): return '' % (self.stop_id, self.stop_name) @@ -257,7 +245,7 @@ class ShapePoint(Base): Index('idx_shape_for_trips', feed_id, shape_id), ) - _validate_lon_lat = _validate_float_range(-180, 180, + _validate_lon_lat = _validate_float_range(-180, 180, False, 'shape_pt_lon', 'shape_pt_lat') _validate_shape_dist_traveled = _validate_float_none('shape_dist_traveled') From f1f9b1844e5c55fd8a6d9e76ceadc675fb0db199 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Fri, 6 Jan 2023 14:52:11 -0500 Subject: [PATCH 10/10] Clarified where lat and long are nullable for clarity --- pygtfs/gtfs_entities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pygtfs/gtfs_entities.py b/pygtfs/gtfs_entities.py index 6c186d5..d8bd42b 100644 --- a/pygtfs/gtfs_entities.py +++ b/pygtfs/gtfs_entities.py @@ -236,6 +236,7 @@ class ShapePoint(Base): _plural_name_ = 'shapes' feed_id = Column(Integer, ForeignKey('_feed.feed_id'), primary_key=True) shape_id = Column(Unicode, primary_key=True) + nullable_lat_long = False shape_pt_lat = Column(Float) shape_pt_lon = Column(Float) shape_pt_sequence = Column(Integer, primary_key=True) @@ -245,7 +246,7 @@ class ShapePoint(Base): Index('idx_shape_for_trips', feed_id, shape_id), ) - _validate_lon_lat = _validate_float_range(-180, 180, False, + _validate_lon_lat = _validate_float_range(-180, 180, nullable_lat_long, 'shape_pt_lon', 'shape_pt_lat') _validate_shape_dist_traveled = _validate_float_none('shape_dist_traveled')