diff --git a/.github/workflows/template_build.yml b/.github/workflows/template_build.yml
index f640dde2d..9d2af6575 100644
--- a/.github/workflows/template_build.yml
+++ b/.github/workflows/template_build.yml
@@ -50,6 +50,15 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - uses: actions/checkout@v4
+        with:
+          repository: NYCPlanning/product-metadata
+          path: product_metadata
+
+      - name: set_product_metadata_path
+        run: echo "PRODUCT_METADATA_REPO_PATH=$(pwd)/product_metadata" >> $GITHUB_ENV
+        working-directory: ./
+
       - name: Load Secrets
         uses: 1password/load-secrets-action@v1
         with:
diff --git a/dcpy/connectors/socrata/publish.py b/dcpy/connectors/socrata/publish.py
index a7999155c..1205bcc58 100644
--- a/dcpy/connectors/socrata/publish.py
+++ b/dcpy/connectors/socrata/publish.py
@@ -134,7 +134,7 @@ def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
             description=attrs.description,
             category=attrs.category,
             attribution=attrs.attribution or "",
-            attributionLink=attrs.attributionLink or "",
+            attributionLink=attrs.attribution_link or "",
             tags=attrs.tags or [],
             metadata={
                 "rowLabel": attrs.each_row_is_a,
@@ -158,7 +158,7 @@ def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
                 "Legislative Compliance": {
                     "Removed Records?": "Yes",  # refers to row removal at time of push to Socrata. Always true since we overwrite the existing dataset.
                     "Has Data Dictionary?": "Yes",
-                    "Geocoded?": "Yes",
+                    "Geocoded?": "Yes" if attrs.geocoded else "No",
                     "External Frequency (LL 110/2015)": attrs.publishing_frequency,
                     "Exists Externally? (LL 110/2015)": "Yes",
                     "Contains Address?": (
diff --git a/dcpy/lifecycle/package/_cli.py b/dcpy/lifecycle/package/_cli.py
index 44954906c..57a6923ce 100644
--- a/dcpy/lifecycle/package/_cli.py
+++ b/dcpy/lifecycle/package/_cli.py
@@ -4,12 +4,12 @@
 
 from .esri import app as esri_app
 from .assemble import app as assemble_app
-from .oti_xlsx import app as oti_xlsx_app
+from .xlsx_writer import app as xlsx_writer_app
 from .shapefiles import app as shapefile_app
 
 app = typer.Typer()
 app.command(name="validate")(_validate)
 app.add_typer(esri_app, name="esri")
 app.add_typer(assemble_app, name="assemble")
-app.add_typer(oti_xlsx_app, name="oti")
+app.add_typer(xlsx_writer_app, name="oti")
 app.add_typer(shapefile_app, name="shapefile")
diff --git a/dcpy/lifecycle/package/abstract_doc.py b/dcpy/lifecycle/package/abstract_doc.py
new file mode 100644
index 000000000..3631cba10
--- /dev/null
+++ b/dcpy/lifecycle/package/abstract_doc.py
@@ -0,0 +1,362 @@
+from pathlib import Path
+from typing import Any
+
+from dcpy.models.design import elements as de
+from dcpy.models.product.dataset.metadata_v2 import Dataset
+from dcpy.models.product.metadata import OrgMetadata
+from dcpy.models.product.artifacts import Artifact, ExcelTableComponentDefinition
+from dcpy.utils.logging import logger
+
+
+# TODO: Extract these into a generic style that we can pass to the XLSX renderer
+BLUE = "FF009DDC"
+TITLE_FONT_SIZE = 18.0
+MONOSPACED_FONT = "Consolas"
+
+
+def _make_title_subtitle_cell(title: str, subtitle: str):
+    return de.Cell(
+        style=de.CellStyle(text_alignment_vertical="bottom"),
+        value=[
+            de.Cell(
+                value=title + " - ",
+                style=de.CellStyle(
+                    font=de.Font(bold=True, size=TITLE_FONT_SIZE),
+                ),
+            ),
+            de.Cell(
+                value=subtitle,
+                style=de.CellStyle(
+                    font=de.Font(bold=True, rgb=BLUE, size=TITLE_FONT_SIZE),
+                ),
+            ),
+        ],
+    )
+
+
+def _make_table_top(
+    title: str,
+    subtitle: str,
+    description: str | None = None,
+    maybe_image_path: Path | None = None,
+    column_headers: list[str] =
[], + column_header_descriptions: list[str] = [], +): + rows = [] + if maybe_image_path: + rows.append( + de.Row( # maybe the image cell + skip_default_styling=True, # move + height=100, # move + merge_cells=True, + cells=[de.Cell(value=de.Image(path=maybe_image_path))], + ) + ) + + rows.append( + de.Row( # subtitle cell + merge_cells=True, + is_top_row=True, # move + height=30, # move + cells=[_make_title_subtitle_cell(title, subtitle)], + ) + ) + if description: + rows.append( + de.Row( # maybe the description cell + merge_cells=True, + height=50, + cells=[ + de.Cell( + value=description, + style=de.CellStyle( + text_alignment_vertical="center", + font=de.Font(italic=True, rgb=BLUE), + ), + ), + ], + ) + ) + if column_headers: + rows.append( + de.Row( + cells=[ + de.Cell( + style=de.CellStyle( + text_alignment_horizontal="center", + text_alignment_vertical="center", + font=de.Font(bold=True), + ), + value=col_name, + ) + for col_name in column_headers + ] + ) + ) + if column_header_descriptions: + rows.append( + de.Row( + cells=[ + de.Cell( + style=de.CellStyle( + text_alignment_vertical="center", + font=de.Font(italic=True, rgb=BLUE), + ), + value=desc, + ) + for desc in column_header_descriptions + ] + ), + ) + + return rows + + +def make_object_table( + *, + title: str, + subtitle: str, + table_rows: list[dict[str, str]], + description: str | None = None, + maybe_image_path: Path | None = None, + column_widths: list[float] | None = None, +) -> de.Table: + """Make a table for an object. + Each row is an attribute of the object. + There are only two columns + - formatted key (summary + description) + - value + """ + rows = _make_table_top( + title=title, + subtitle=subtitle, + description=description, + maybe_image_path=maybe_image_path, + ) + + for r in table_rows: + value = r.get("value") + if value is None: + logger.warning(f"Metadata field is empty for {r}") + elif type(value) is list and type(value[0]) is str: + value = ", ".join(value) + + rows.append( + de.Row( + cells=[ + de.Cell( # field title \n summary for each field + value=[ + de.Cell( + value="\n" + r["summary"] + "\n", # This isn't great + style=de.CellStyle( + font=de.Font(bold=True, size=11), + ), + ), + de.Cell( + value=(r["description"]), + style=de.CellStyle( + font=de.Font(size=9, italic=True), + ), + ), + ], + ), + de.Cell( # Value + value=value, + style=de.CellStyle(font=de.Font(italic=True)), + ), + ] + ) + ) + + return de.Table( + title=title, + subtitle=subtitle, + description=description, + rows=rows, + column_widths=column_widths or [50, 80], + ) + + +def make_list_table( + title: str, + subtitle: str, + table_rows: list[list[Any]], + column_headers: list[str] = [], + column_ids: list[str] = [], + column_descriptions: list[str] = [], + description: str | None = None, + maybe_image_path: Path | None = None, + column_widths: list[float] | None = None, + style_overrides: dict[str, de.CellStyle] = {}, +): + """Make a table for a list of objects. e.g. 
Revisions or columns.
+    There is one row per object (e.g. one row per revision or per dataset column),
+    and the columns to include can be specified.
+    """
+    rows = _make_table_top(
+        title=title,
+        subtitle=subtitle,
+        description=description,
+        maybe_image_path=maybe_image_path,
+        column_headers=column_headers,
+        column_header_descriptions=column_descriptions,
+    )
+
+    for tr in table_rows:
+        cells = []
+        for val, col_id in zip(tr, column_ids):
+            style = style_overrides.get(col_id, de.CellStyle())
+            style.text_alignment_vertical = style.text_alignment_vertical or "top"
+            cells.append(de.Cell(value=val, style=style))
+        rows.append(de.Row(cells=cells))
+
+    return de.Table(
+        title=title,
+        subtitle=subtitle,
+        description=description,
+        rows=rows,
+        column_widths=column_widths or [50, 80],
+    )
+
+
+# TODO: move to pydantic models
+def get_field_metadata(data_source: str, org_metadata: OrgMetadata):
+    match data_source:
+        case "dataset.columns":
+            dictionary_section = org_metadata.data_dictionary.dataset["columns"]
+        case "dataset.revisions":
+            dictionary_section = org_metadata.data_dictionary.dataset["revisions"]
+        case "dataset.attributes":
+            dictionary_section = org_metadata.data_dictionary.dataset["attributes"]
+        case _:
+            raise Exception(f"Unknown data_source: {data_source}")
+    return dictionary_section
+
+
+# TODO: move to pydantic models
+def get_data_source(
+    data_source: str, dataset: Dataset, columns: list[str] = []
+) -> list[list[Any]]:
+    model_dict = dataset.model_dump()
+    match data_source:
+        case "dataset.columns":
+            source = [c.all_fields_repr() for c in dataset.columns]
+            if columns:
+                return [[r.get(c_name) for c_name in columns] for r in source]
+            else:
+                return [[k, v] for k, v in source]
+        case "dataset.revisions":
+            source = model_dict.get("revisions", [])
+            # convert a list[dict] -> list[str], ordered as spec'd in `columns`
+            columns = columns or ["date", "summary", "notes"]
+            return [[r.get(c_name) for c_name in columns] for r in source]
+        case "dataset.attributes":
+            return [[k, v] for k, v in dataset.attributes.model_dump().items()]
+        case _:
+            raise Exception(f"Unknown data_source: {data_source}")
+
+
+def construct_component(
+    component_def: ExcelTableComponentDefinition,
+    dataset: Dataset,
+    org_metadata: OrgMetadata,
+) -> de.Table:
+    full_image_path = (
+        org_metadata.get_full_resource_path(component_def.image_path)
+        if component_def.image_path
+        else None
+    )
+
+    dictionary_section = get_field_metadata(
+        data_source=component_def.data_source or "", org_metadata=org_metadata
+    )
+
+    match component_def.type:
+        case "object_table":
+            # If it's an object_table, then a table row represents a field on a model.
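+            # Each table_rows entry pairs a data-dictionary field with its value.
+            # E.g. (assuming an "agency" attribute whose dictionary summary is
+            # "Data Provided by", as in the test resources):
+            #   {"field_name": "agency", "title": "agency",
+            #    "summary": "Data Provided by", "description": "...", "value": "DCP"}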
+ table_rows = [] + data_source = get_data_source( + data_source=component_def.data_source or "", + dataset=dataset, + ) + data_source_dict = dict(data_source) + + for field_name in component_def.rows or []: + data_dict_field = dictionary_section[field_name] + description_paragraphs = [data_dict_field.extra_description or ""] + if component_def.extra_field_description_field: + description_paragraphs.append( + data_dict_field.custom.get( + component_def.extra_field_description_field, "" + ) + ) + table_rows.append( + { + "field_name": field_name, + "title": field_name, + "summary": data_dict_field.summary, + "description": "\n".join(description_paragraphs), + "value": data_source_dict.get(field_name), + } + ) + + return make_object_table( + title=component_def.title, + subtitle=component_def.subtitle, + description=component_def.description, + table_rows=table_rows, + maybe_image_path=full_image_path, + column_widths=component_def.column_widths, + ) + case "list_table": + # For list tables, unlike object_tables, `field_metadata_rows` will inform the columns, not the rows + data_source = get_data_source( + data_source=component_def.data_source or "", + columns=component_def.columns or [], + dataset=dataset, + ) + + assert ( + component_def.columns + ), "Columns must be specified for list_table type." + column_headers = [] + column_descriptions = [] + style_overrides = {} + for f in component_def.columns: + field = dictionary_section[f] + column_headers.append(field.summary) + column_descriptions.append(field.extra_description) + if ( + component_def.data_source == "dataset.columns" + and "values" in component_def.columns + ): # The abstraction has leaked! + style_overrides["values"] = de.CellStyle( + font=de.Font(name=MONOSPACED_FONT) + ) + + return make_list_table( + title=component_def.title, + subtitle=component_def.subtitle, + description=component_def.description, + column_ids=component_def.columns, + column_headers=column_headers, + column_descriptions=column_descriptions + if component_def.include_column_description_row + else [], + table_rows=data_source, + maybe_image_path=full_image_path, + column_widths=component_def.column_widths, + style_overrides=style_overrides, + ) + case _: + raise Exception(f"Component type {component_def.type} not implemented.") + + +def generate_abstract_artifact( + product: str, dataset: str, artifact: Artifact, org_metadata: OrgMetadata +): + ds = org_metadata.product(product).dataset(dataset).dataset + + return [ + construct_component(component_def=c, dataset=ds, org_metadata=org_metadata) + for c in artifact.components + ] diff --git a/dcpy/lifecycle/package/assemble.py b/dcpy/lifecycle/package/assemble.py index ca1291aae..c6328843d 100644 --- a/dcpy/lifecycle/package/assemble.py +++ b/dcpy/lifecycle/package/assemble.py @@ -7,7 +7,7 @@ from dcpy.configuration import PRODUCT_METADATA_REPO_PATH from dcpy.lifecycle import WORKING_DIRECTORIES -from dcpy.lifecycle.package import oti_xlsx +from dcpy.lifecycle.package import xlsx_writer from dcpy.lifecycle.package import assemble import dcpy.models.product.dataset.metadata_v2 as md import dcpy.models.product.metadata as prod_md @@ -187,15 +187,18 @@ def pull_all_destination_files(local_package_path: Path, product_metadata: md.Me def assemble_dataset_from_bytes( *, - dataset_metadata: md.Metadata, + org_md: prod_md.OrgMetadata, product: str, + dataset: str, version: str, source_destination_id: str, out_path: Path | None = None, metadata_only: bool = False, ) -> Path: - out_path = out_path or ASSEMBLY_DIR / 
product / version / dataset_metadata.id + out_path = out_path or ASSEMBLY_DIR / product / version / dataset logger.info(f"Assembling dataset from BYTES. Writing to: {out_path}") + + dataset_metadata = org_md.product(product).dataset(dataset) assemble.pull_destination_files( out_path, dataset_metadata, @@ -204,27 +207,20 @@ def assemble_dataset_from_bytes( metadata_only=metadata_only, ) - oti_data_dictionaries = [ + excel_data_dictionaries = [ f.file for f in dataset_metadata.files - if f.file.type == oti_xlsx.OTI_METADATA_FILE_TYPE + if f.file.type == xlsx_writer.EXCEL_DATA_DICT_METADATA_FILE_TYPE ] - for f in oti_data_dictionaries: + for f in excel_data_dictionaries: # this should eventually be generalized into something that will # generate all required missing files, or just running through a list of # packaging steps. But for now, it's just the OTI files. - overridden_md_key = f.custom.get(ASSEMBLY_INSTRUCTIONS_KEY, {}).get( - METADATA_OVERRIDE_KEY - ) - - ds_md = ( - dataset_metadata.calculate_metadata(**overridden_md_key) - if overridden_md_key - else dataset_metadata.dataset - ) logger.info(f"Generating OTI XLSX for file {f.filename}") - oti_xlsx.write_oti_xlsx( - dataset=ds_md, + xlsx_writer.write_xlsx( + org_md=org_md, + product=product, + dataset=dataset, output_path=out_path / "attachments" / f.filename, ) return out_path @@ -274,8 +270,9 @@ def assemble_dataset_from_bytes_cli( ) assemble_dataset_from_bytes( - dataset_metadata=org_md.product(product).dataset(dataset_name), + org_md=org_md, product=product, + dataset=dataset_name, source_destination_id=source_destination_id, version=version, out_path=out_path, diff --git a/dcpy/lifecycle/package/oti_xlsx.py b/dcpy/lifecycle/package/oti_xlsx.py deleted file mode 100644 index 7562a1d9e..000000000 --- a/dcpy/lifecycle/package/oti_xlsx.py +++ /dev/null @@ -1,219 +0,0 @@ -import openpyxl # type: ignore -from openpyxl.styles import Border, Side, Alignment, Font # type: ignore -from pathlib import Path -from tabulate import tabulate # type: ignore -import typer - - -from dcpy.models.product.dataset import metadata_v2 as md_v2 -from dcpy.utils.logging import logger - -from . import RESOURCES_PATH - -DEFAULT_TEMPLATE_PATH = RESOURCES_PATH / "oti_data_dictionary_template.xlsx" - - -OTI_METADATA_FILE_TYPE = "oti_data_dictionary" - - -class OTI_XLSX_TABS: - dataset_info = "Dataset Information" - column_information = "Column Information" - revision_history_information = "Dataset Revision History" - - -DISCLAIMER = """\ - This dataset is being provided by the Department of City Planning (DCP) on DCP’s\ - website for informational purposes only. DCP does not warranty the completeness,\ - accuracy, content, or fitness for any particular purpose or use of the dataset,\ - nor are any such warranties to be implied or inferred with respect to the dataset\ - as furnished on the website. DCP and the City are not liable for any deficiencies\ - in the completeness, accuracy, content, or fitness for any particular purpose or\ - use the dataset, or applications utilizing the dataset, provided by any third party.\ -""" - -# pulling this out solely for a test case. 
-_DESCRIPTION_ROW_INDEX = 15 - - -def _get_dataset_description(path: Path): - xlsx_wb = openpyxl.load_workbook(filename=path) - ds_info = xlsx_wb[OTI_XLSX_TABS.dataset_info] - - rows = [r for r in ds_info.rows] - return rows[_DESCRIPTION_ROW_INDEX][1].value - - -def _write_dataset_information(xlsx_wb: openpyxl.Workbook, metadata: md_v2.Dataset): - ds_info_sheet = xlsx_wb[OTI_XLSX_TABS.dataset_info] - - rows = [r for r in ds_info_sheet.rows] - - # Dataset Name: 8 - rows[8][1].value = metadata.attributes.display_name - - # Dataset URL: 9 - # TODO - - # The name of the NYC agency providing this data to the public.": 10 - rows[10][1].value = metadata.attributes.display_name - - # Each Row Is A: The unit of analysis/level of aggregation of the dataset": 11 - rows[11][1].value = metadata.attributes.each_row_is_a - - # Publishing Frequency. How often changed data is published to this dataset. For an automatically updated dataset, this is the frequency of that automation": 12 - rows[12][1].value = metadata.attributes.publishing_frequency - - # How often the data underlying this dataset is changed": 13 - rows[13][1].value = metadata.attributes.publishing_frequency - - # Frequency Details. Additional details about the publishing or data change frequency, if needed": 14 - rows[14][1].value = metadata.attributes.publishing_frequency_details - - # Dataset Description. Overview of the information this dataset contains, including overall context and definitions of key terms. This field may include links to supporting datasets, agency websites, or external resources for additional context. ": 15 - rows[_DESCRIPTION_ROW_INDEX][1].value = metadata.attributes.description - - # Why is this dataset collected. Purpose behind the collection of this data, including any legal or policy requirements for this data by NYC Executive Order, Local Law, or other policy directive.": 16 - rows[16][1].value = metadata.attributes.publishing_purpose - - # How is this data collectred? If data collection includes fielding applications, requests, or complaints, this field includes details about the forms, applications, and processes used. ": 17 - rows[17][1].value = metadata.attributes.publishing_purpose - - # "How can this data be used? What are some questions one might answer using this dataset?": 18 - rows[18][1].value = metadata.attributes.potential_uses - - # What are the unique characteristics or limitations of this dataset? 
Unique characteristics of this dataset to be aware of, specifically, constraints or limitations to the use of the data.": 19 - rows[19][1].value = DISCLAIMER - # For any datasets with geospatial data, specify the coordinate reference system or projection used and other relevant details.": 20 - rows[20][1].value = metadata.attributes.projection - - -def _set_default_style(cell, *, is_rightmost=True, is_last_row=False): - border_side_thin = Side(border_style="thin", color="000000") - border_side_medium = Side(border_style="medium", color="000000") - - cell.alignment = Alignment(wrapText=True, vertical="top") - cell.border = Border( - top=border_side_thin, - left=border_side_thin, - right=border_side_medium if is_rightmost else border_side_thin, - bottom=border_side_medium if is_last_row else border_side_thin, - ) - - -def _format_row_slice(row_slice, is_last_row=False): - """Format a row slice with OTI's table formatting.""" - *left_cells, rightmost_cell = row_slice - [ - _set_default_style( - c, - is_last_row=is_last_row, - is_rightmost=False, - ) - for c in left_cells - ] - _set_default_style(rightmost_cell, is_last_row=is_last_row, is_rightmost=True) - - -def _write_column_information(xlsx_wb: openpyxl.Workbook, metadata: md_v2.Dataset): - ds_info_sheet = xlsx_wb[OTI_XLSX_TABS.column_information] - - header_description_row_index = 2 - ds_info_sheet.insert_rows(header_description_row_index + 2, len(metadata.columns)) - - rows = [r for r in ds_info_sheet.rows] - - for idx, md_col in enumerate(metadata.columns): - new_row_idx = idx + header_description_row_index + 1 - - row_slice = rows[new_row_idx][0:5] - col_name, col_val, col_expected_values, col_field_limits, col_notes = row_slice - - # The basics - col_name.value = md_col.name - col_val.value = md_col.description - col_field_limits.value = "" # TODO: field limitations - col_notes.value = md_col.notes - - # Standardized Values Table - col_expected_values.font = Font( - name="Consolas" - ) # Need a monospaced font for tables columns to align - col_expected_values.value = ( - tabulate( - [ - [str(v.value) + " ", str(v.description or " ") + " "] # bool issue - for v in md_col.values - ], - headers=["Value", "Description"], - tablefmt="presto", - maxcolwidths=[10, 40], - ) - if md_col.values - else "" - ) - - _format_row_slice(row_slice, is_last_row=idx == (len(metadata.columns) - 1)) - - -def _write_change_history(xlsx_wb: openpyxl.Workbook, change_log: list[list[str]]): - rev_sheet = xlsx_wb[OTI_XLSX_TABS.revision_history_information] - header_description_row_index = 2 - - rev_sheet.insert_rows(header_description_row_index + 2, len(change_log)) - - rows = [r for r in rev_sheet.rows] - for idx, rev in enumerate(change_log): - row_slice = rows[idx + header_description_row_index + 1][0:3] - col_date, col_change_highlights, col_comments = row_slice - - col_date.value = rev[0] - col_change_highlights.value = rev[1] - col_comments.value = rev[2] - - _format_row_slice(row_slice, is_last_row=idx == (len(change_log) - 1)) - - -def write_oti_xlsx( - *, - dataset: md_v2.Dataset, - output_path: Path | None = None, - template_path_override: Path | None = None, -): - xlsx_wb = openpyxl.load_workbook( - filename=template_path_override or DEFAULT_TEMPLATE_PATH - ) - _write_dataset_information(xlsx_wb, dataset) - _write_column_information(xlsx_wb, dataset) - # TODO: this locationw will change - _write_change_history(xlsx_wb, dataset.attributes.custom.get("change_log", [])) - - out_path = output_path or Path("./data_dictionary.xlsx") - logger.info(f"Saving 
OTI XLSX to {out_path}") - xlsx_wb.save(out_path) - - -app = typer.Typer() - - -@app.command("generate_xlsx") -def _write_oti_xlsx_cli( - metadata_path: Path, - output_path: Path = typer.Option( - None, - "--output-path", - "-o", - help="Output Path. Defaults to ./data_dictionary.xlsx", - ), - template_path_override: Path = typer.Option( - None, - "--template-path", - "-t", - help="(Override) Template Path", - ), -): - write_oti_xlsx( - dataset=md_v2.Metadata.from_path(metadata_path).dataset, - output_path=output_path, - template_path_override=template_path_override, - ) diff --git a/dcpy/lifecycle/package/resources/oti_data_dictionary_template.xlsx b/dcpy/lifecycle/package/resources/oti_data_dictionary_template.xlsx index c53e38b9c..e9e5b9815 100644 Binary files a/dcpy/lifecycle/package/resources/oti_data_dictionary_template.xlsx and b/dcpy/lifecycle/package/resources/oti_data_dictionary_template.xlsx differ diff --git a/dcpy/lifecycle/package/xlsx_writer.py b/dcpy/lifecycle/package/xlsx_writer.py new file mode 100644 index 000000000..8b28c7ef8 --- /dev/null +++ b/dcpy/lifecycle/package/xlsx_writer.py @@ -0,0 +1,268 @@ +from copy import copy +import openpyxl # type: ignore +from openpyxl.styles import Border, Side, Alignment # type: ignore +from openpyxl.cell.text import InlineFont # type: ignore +from openpyxl.cell.rich_text import TextBlock, CellRichText # type: ignore +from openpyxl.drawing.image import Image # type: ignore +from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder # type: ignore +from openpyxl.utils import get_column_letter +from pathlib import Path +import typer + +from dcpy.configuration import PRODUCT_METADATA_REPO_PATH +from dcpy.models.product.metadata import OrgMetadata +from dcpy.models.design import elements as de +from dcpy.utils.logging import logger + +from . import RESOURCES_PATH +from . import abstract_doc + +# TODO: Move template to Product Metadata Repo. 
Rename to be non-OTI specific
+DEFAULT_TEMPLATE_PATH = RESOURCES_PATH / "oti_data_dictionary_template.xlsx"
+EXCEL_DATA_DICT_METADATA_FILE_TYPE = "excel_data_dictionary"
+DEFAULT_FONT = "Arial"
+
+
+def _set_default_style(cell, *, is_rightmost=True, is_topmost=False, is_last_row=False):
+    border_side_thin = Side(border_style="thin", color="000000")
+    border_side_medium = Side(border_style="medium", color="000000")
+
+    cell.alignment = Alignment(wrapText=True, vertical="center")
+    cell.border = Border(
+        top=border_side_medium if is_topmost else border_side_thin,
+        left=border_side_thin,
+        right=border_side_medium if is_rightmost else border_side_thin,
+        bottom=border_side_medium if is_last_row else border_side_thin,
+    )
+
+
+def _format_row_slice(row_slice, is_first_row=False, is_last_row=False):
+    """Format a row slice with OTI's table formatting."""
+    *left_cells, rightmost_cell = row_slice
+    [
+        _set_default_style(
+            c,
+            is_last_row=is_last_row,
+            is_topmost=is_first_row,
+            is_rightmost=False,
+        )
+        for c in left_cells
+    ]
+    _set_default_style(
+        rightmost_cell,
+        is_topmost=is_first_row,
+        is_last_row=is_last_row,
+        is_rightmost=True,
+    )
+
+
+def _abstract_style_to_xlsx(c: de.CellStyle):
+    return {
+        k: v
+        for k, v in {
+            "rFont": c.font.name or DEFAULT_FONT,
+            "color": c.font.rgb,
+            "b": c.font.bold,
+            "sz": c.font.size,
+            "i": c.font.italic,
+        }.items()
+        if v
+    }
+
+
+def _to_human_readable_val(v) -> str:
+    if type(v) is bool:
+        return "Yes" if v else "No"
+    elif not v:
+        return ""
+    else:
+        return str(v)
+
+
+def generate_table_sheet(
+    xlsx_wb: openpyxl.Workbook,
+    table: de.Table,
+    *,
+    tab_name: str,
+    tab_index: int = -1,
+    table_row_start_index=1,
+):
+    """Adds a worksheet + table to an XLSX file."""
+    new_sheet = xlsx_wb.create_sheet(title=tab_name, index=tab_index)
+    new_sheet.sheet_view.showGridLines = False
+
+    new_sheet.insert_rows(table_row_start_index, len(table.rows))
+    new_sheet.insert_cols(1, table.total_cols() - 1)
+    new_sheet_rows = [r for r in new_sheet.rows]
+
+    # Set Column Widths when specified
+    dim_holder = DimensionHolder(worksheet=new_sheet)  # type: ignore
+    for idx, col in enumerate(range(new_sheet.min_column, new_sheet.max_column + 1)):
+        col_dim = ColumnDimension(new_sheet, min=col, max=col)
+
+        maybe_width = (
+            table.column_widths[idx]
+            if table.column_widths and len(table.column_widths) > idx
+            else None
+        )
+        if maybe_width:
+            col_dim.width = maybe_width
+
+        dim_holder[get_column_letter(col)] = col_dim
+    new_sheet.column_dimensions = dim_holder
+
+    for r_idx, r in enumerate(table.rows):
+        row = new_sheet_rows[r_idx]
+        if r.merge_cells:
+            # for merged cells, just format the top-leftmost cell
+            if not r.skip_default_styling:
+                _format_row_slice(
+                    row[0:1],
+                    is_first_row=r.is_top_row,
+                    is_last_row=r_idx == len(table.rows) - 1,
+                )
+            new_sheet.merge_cells(
+                start_row=r_idx + 1,
+                end_row=r_idx + 1,
+                start_column=1,
+                end_column=table.total_cols(),
+            )
+        else:
+            if not r.skip_default_styling:
+                _format_row_slice(row, is_last_row=r_idx == len(table.rows) - 1)
+
+        if r.height:
+            new_sheet.row_dimensions[r_idx + 1].height = r.height
+        for c_idx, c in enumerate(r.cells):
+            if type(c.value) is list and c.value and type(c.value[0]) is de.Cell:
+                # Inline Cells
+                cell = CellRichText(
+                    [
+                        TextBlock(
+                            InlineFont(**_abstract_style_to_xlsx(ic.style)),
+                            _to_human_readable_val(ic.value),
+                        )
+                        for ic in c.value
+                    ]
+                )
+                row[c_idx].value = cell
+            elif type(c.value) is de.Image:
+                cell = row[c_idx]
+
+                # TODO: should probably use: https://openpyxl.readthedocs.io/en/3.1/api/openpyxl.utils.units.html
+                PIXELS_PER_INCH = 96.0
+                img = Image(c.value.path)
+                img.height = int(1.01 * PIXELS_PER_INCH)
+                img.width = int(6.15 * PIXELS_PER_INCH)
+                new_sheet.add_image(img, cell.coordinate)  # type: ignore
+            else:
+                row[c_idx].value = CellRichText(
+                    TextBlock(
+                        InlineFont(**_abstract_style_to_xlsx(c.style)),
+                        _to_human_readable_val(c.value),
+                    )
+                )
+            if c.style.text_alignment_vertical or c.style.text_alignment_horizontal:
+                alignment = copy(row[c_idx].alignment)
+                if c.style.text_alignment_vertical:
+                    alignment.vertical = c.style.text_alignment_vertical
+                if c.style.text_alignment_horizontal:
+                    alignment.horizontal = c.style.text_alignment_horizontal
+                row[c_idx].alignment = alignment
+
+
+def write_xlsx(
+    *,
+    org_md: OrgMetadata,
+    product: str,
+    dataset: str | None = None,
+    output_path: Path | None = None,
+    template_path_override: Path | None = None,
+    artifact_name: str | None = None,
+):
+    """Adds Metadata Tables to an Excel Template.
+
+    For a given product.dataset, this will add a worksheet + table for each component specified
+    in the artifacts.yml in the metadata repo.
+    """
+
+    artifacts = org_md.get_packaging_artifacts()
+    dataset = dataset or product
+
+    xlsx_artifacts = [a for a in artifacts if a.type == "xlsx"]
+    artifact = None
+    if artifact_name:
+        matched = [a for a in xlsx_artifacts if a.name == artifact_name]
+        if len(matched) != 1:
+            raise Exception(
+                f"Expected exactly one artifact named {artifact_name}, found {len(matched)}"
+            )
+        artifact = matched[0]
+    else:
+        if len(xlsx_artifacts) == 1:
+            artifact = xlsx_artifacts[0]
+        else:
+            raise Exception("An artifact name must be specified")
+
+    tables = abstract_doc.generate_abstract_artifact(
+        product=product, dataset=dataset, org_metadata=org_md, artifact=artifact
+    )
+    xlsx_wb = openpyxl.load_workbook(
+        filename=template_path_override or DEFAULT_TEMPLATE_PATH
+    )
+
+    tab_and_tables = zip(artifact.components, tables)
+    for tab, table in tab_and_tables:
+        generate_table_sheet(
+            xlsx_wb, table=table, tab_index=tab.index, tab_name=tab.name
+        )
+
+    if "delete_me" in xlsx_wb.sheetnames:
+        # A saved workbook needs at least one visible sheet, so the template ships with one called `delete_me`
+        del xlsx_wb["delete_me"]
+    out_path = output_path or Path("./data_dictionary.xlsx")
+    logger.info(f"Saving XLSX to {out_path}")
+    xlsx_wb.save(out_path)
+
+
+app = typer.Typer()
+
+
+@app.command("generate_xlsx")
+def _write_xlsx_cli(
+    product: str,
+    dataset: str,
+    artifact_name: str = typer.Option(
+        None,
+        "--artifact-name",
+        "-n",
+        help="Name of the xlsx artifact to generate",
+    ),
+    output_path: Path = typer.Option(
+        Path("data_dictionary.xlsx"),
+        "--output-path",
+        "-o",
+        help="Output Path. Defaults to ./data_dictionary.xlsx",
+    ),
+    template_path_override: Path = typer.Option(
+        None,
+        "--template-path",
+        "-t",
+        help="(Override) Template Path",
+    ),
+    metadata_path_override: Path = typer.Option(
+        None,
+        "--metadata-path",
+        "-m",
+        help="Metadata repo path override",
+    ),
+):
+    assert metadata_path_override or PRODUCT_METADATA_REPO_PATH
+    org_md = OrgMetadata.from_path(
+        metadata_path_override or Path(PRODUCT_METADATA_REPO_PATH)  # type: ignore
+    )
+    write_xlsx(
+        product=product,
+        dataset=dataset,
+        org_md=org_md,
+        artifact_name=artifact_name,
+        output_path=output_path,
+        template_path_override=template_path_override,
+    )
diff --git a/dcpy/lifecycle/scripts/package_and_distribute.py b/dcpy/lifecycle/scripts/package_and_distribute.py
index 9722ee050..0285e0a2d 100644
--- a/dcpy/lifecycle/scripts/package_and_distribute.py
+++ b/dcpy/lifecycle/scripts/package_and_distribute.py
@@ -28,8 +28,9 @@ def from_bytes_to_tagged_socrata(
     package_paths = {}
     for ds_id, dests_to_mds in dests.items():
         out_path = assemble.assemble_dataset_from_bytes(
-            dataset_metadata=product_md.dataset(ds_id),
+            org_md=org_md,
             product=product,
+            dataset=ds_id,
             version=version,
             source_destination_id="bytes",
             metadata_only=publish_kwargs["metadata_only"],
diff --git a/dcpy/models/base.py b/dcpy/models/base.py
index 64485cb2d..6f1bc3afc 100644
--- a/dcpy/models/base.py
+++ b/dcpy/models/base.py
@@ -26,6 +26,19 @@ class SortedSerializedBase(BaseModel):
     _exclude_falsey_values: bool = True
     _head_sort_order: list[str] = PrivateAttr(default=["id"])
    _tail_sort_order: list[str] = PrivateAttr(default=["custom"])
+    _repr_functions: dict[str, typing.Callable[[typing.Any], str]] = {}
+
+    def field_repr(self, field_name: str):
+        """overrideable method to mimic __repr__ when we have class attributes that
+        aren't pydantic classes. e.g.
list[DatasetColumn]""" + attr = getattr(self, field_name) + if field_name in self._repr_functions: + repr_fn = self._repr_functions[field_name] + return repr_fn(attr) + return str(attr or "") + + def all_fields_repr(self) -> dict[str, str]: + return {k: self.field_repr(k) for k, _ in self.model_fields.items()} @model_serializer(mode="wrap") def _model_dump_ordered(self, handler): diff --git a/dcpy/models/design/elements.py b/dcpy/models/design/elements.py new file mode 100644 index 000000000..91882c95e --- /dev/null +++ b/dcpy/models/design/elements.py @@ -0,0 +1,47 @@ +from pydantic import BaseModel +from pathlib import Path +import typing + + +class Font(BaseModel): + name: str | None = None + size: float | None = None + rgb: str | None = None + italic: bool = False + bold: bool = False + monospaced: bool = False + + +class CellStyle(BaseModel): + font: Font = Font() + borders: list[str] | None = None + text_alignment_vertical: str | None = None + text_alignment_horizontal: str | None = None + + +class Image(BaseModel): + path: Path + + +class Cell(BaseModel): + value: typing.Any | Image | list["Cell"] # can be a value or inline cells + style: CellStyle = CellStyle() + + +class Row(BaseModel): + cells: list[Cell] + merge_cells: bool = False + is_top_row: bool = False + height: float | None = None + skip_default_styling: bool = False + + +class Table(BaseModel): + title: str + subtitle: str | None = None + description: str | None = None + rows: list[Row] + column_widths: list[float] | None = [] + + def total_cols(self): + return max(len(r.cells) for r in self.rows) diff --git a/dcpy/models/product/artifacts.py b/dcpy/models/product/artifacts.py new file mode 100644 index 000000000..e6ad602b7 --- /dev/null +++ b/dcpy/models/product/artifacts.py @@ -0,0 +1,45 @@ +from pathlib import Path + +from .dataset.metadata_v2 import CustomizableBase +from dcpy.models.base import TemplatedYamlReader + + +class ExcelTableComponentDefinition(CustomizableBase): + """Declaration for a table in an XLSX. + + A table should declare a data_source from which to pull data (and field metadata), + and specify rows OR columns. (rows AND columns makes sense in theory, but isn't implemented) + """ + + id: str + name: str + type: str # atm, either a `list_table` or `object_table`... but we should determine this from the object itself, so this attribute should go away. + index: int + data_source: str | None = None + + title: str + subtitle: str + description: str | None = None # table description + include_column_description_row: bool = ( + True # header row underneath columns, with a description of the columns + ) + + extra_field_description_field: str | None = ( + None # field from which to pull extra description paragraphs + ) + image_path: Path | None = None + rows: list[str] | None = None + columns: list[str] | None = None + column_widths: list[float] | None = ( + None # TODO: generalize away from concrete numbers. 
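+        # Assumed to be consumed as openpyxl column widths (roughly character
+        # counts of the default font), not pixels -- see generate_table_sheet.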
+ ) + + +class Artifact(CustomizableBase, TemplatedYamlReader): + name: str + type: str + components: list[ExcelTableComponentDefinition] + + +class Artifacts(CustomizableBase, TemplatedYamlReader): + artifacts: list[Artifact] diff --git a/dcpy/models/product/data_dictionary.py b/dcpy/models/product/data_dictionary.py new file mode 100644 index 000000000..c49fccfe1 --- /dev/null +++ b/dcpy/models/product/data_dictionary.py @@ -0,0 +1,17 @@ +from .dataset.metadata_v2 import CustomizableBase +from dcpy.models.base import TemplatedYamlReader + + +class FieldDefinition(CustomizableBase): + summary: str + extra_description: str | None = None + + +class FieldSet(CustomizableBase): + fields: dict[str, FieldDefinition] = {} + + +class DataDictionary(CustomizableBase, TemplatedYamlReader): + org: dict[str, dict[str, FieldDefinition]] = {} + product: dict[str, dict[str, FieldDefinition]] = {} + dataset: dict[str, dict[str, FieldDefinition]] = {} diff --git a/dcpy/models/product/dataset/metadata_v2.py b/dcpy/models/product/dataset/metadata_v2.py index 0d7b7710f..8e176eb99 100644 --- a/dcpy/models/product/dataset/metadata_v2.py +++ b/dcpy/models/product/dataset/metadata_v2.py @@ -1,6 +1,7 @@ from __future__ import annotations from pydantic import BaseModel +from tabulate import tabulate # type: ignore from typing import Any, List import unicodedata @@ -55,19 +56,37 @@ class ColumnValue(CustomizableBase): description: str | None = None -class DatasetColumn(Column): +def make_value_table(values: list[ColumnValue]) -> str: + return ( + tabulate( + [ + [str(v.value) + " ", str(v.description or " ") + " "] # bool issue + for v in values + ], + headers=["Value", "Description"], + tablefmt="presto", + maxcolwidths=[10, 40], + ) + if values + else "" + ) + + +class DatasetColumn(CustomizableBase, Column): _head_sort_order = ["id", "name", "data_type", "description"] _tail_sort_order = ["example", "values", "custom"] + _repr_functions = {"values": make_value_table} # Note: id isn't intended to be overrideable, but is always required as a # pointer back to the original column. name: str | None = None data_source: str | None = None + description: str | None = None + limitations: str | None = None notes: str | None = None example: str | None = None deprecated: bool | None = None values: list[ColumnValue] | None = None - custom: dict[str, Any] = {} def override(self, overrides: DatasetColumn) -> DatasetColumn: return DatasetColumn(**merge(self.model_dump(), overrides.model_dump())) @@ -118,19 +137,27 @@ class DatasetOrgProductAttributesOverride(CustomizableBase): """Fields that might be set as a default at the Product/Org level.""" agency: str | None = None + agency_website_data_updated_automatically: bool | None = None attribution: str | None = None - attributionLink: str | None = None + attribution_link: str | None = None + can_be_automated: bool | None = None category: str | None = None contact_email: str | None = None contains_address: bool | None = ( None # `contains_address` refers specifically to addresses containing house, numbers + street names. (ie. not just streets, polys, etc.) 
) + data_collection_method: str | None = None + data_change_frequency: str | None = None date_made_public: str | None = None + disclaimer: str | None = None + geocoded: bool | None = None + on_agency_website: bool | None = None potential_uses: str | None = None projection: str | None = None publishing_frequency: str | None = None # TODO: picklist values publishing_frequency_details: str | None = None publishing_purpose: str | None = None + rows_removed: bool | None = None tags: List[str] | None = [] @@ -158,9 +185,16 @@ class DatasetOverrides(CustomizableBase): attributes: DatasetAttributesOverride = DatasetAttributesOverride() +class Revision(CustomizableBase): + date: str + summary: str + notes: str + + class Dataset(CustomizableBase): columns: list[DatasetColumn] attributes: DatasetAttributes + revisions: list[Revision] = [] def override(self, overrides: DatasetOverrides) -> Dataset: """Apply column updates and prune any columns specified as omitted""" @@ -173,7 +207,9 @@ def override(self, overrides: DatasetOverrides) -> Dataset: ] return Dataset( - columns=columns, attributes=self.attributes.override(overrides.attributes) + columns=columns, + attributes=self.attributes.override(overrides.attributes), + revisions=self.revisions, ) @@ -216,6 +252,7 @@ class Metadata(CustomizableBase, YamlWriter, TemplatedYamlReader): columns: List[DatasetColumn] = [] files: List[FileAndOverrides] = [] destinations: List[DestinationWithFiles] = [] + revisions: list[Revision] = [] _head_sort_order = [ "id", @@ -226,7 +263,9 @@ class Metadata(CustomizableBase, YamlWriter, TemplatedYamlReader): @property def dataset(self): - return Dataset(attributes=self.attributes, columns=self.columns) + return Dataset( + attributes=self.attributes, columns=self.columns, revisions=self.revisions + ) def get_package(self, id: str) -> Package: packages = [p for p in self.assembly if p.id == id] diff --git a/dcpy/models/product/metadata.py b/dcpy/models/product/metadata.py index 7a71e44af..33e6c3d71 100644 --- a/dcpy/models/product/metadata.py +++ b/dcpy/models/product/metadata.py @@ -5,6 +5,8 @@ import yaml from dcpy.models.base import SortedSerializedBase, YamlWriter, TemplatedYamlReader +from dcpy.models.product.artifacts import Artifacts, Artifact +from dcpy.models.product.data_dictionary import DataDictionary from dcpy.models.product.dataset.metadata_v2 import ( Metadata as DatasetMetadata, DatasetColumn, @@ -123,6 +125,7 @@ class OrgMetadata(SortedSerializedBase, extra="forbid"): template_vars: dict = Field(default_factory=dict) metadata: OrgMetadataFile column_defaults: dict[tuple[str, COLUMN_TYPES], DatasetColumn] + data_dictionary: DataDictionary = DataDictionary() @classmethod def get_string_snippets(cls, path: Path) -> dict: @@ -151,6 +154,7 @@ def get_column_defaults( @classmethod def from_path(cls, path: Path, template_vars: dict | None = None): template_vars = merge(cls.get_string_snippets(path), template_vars or {}) or {} + dd_default_path = path / "data_dictionary.yml" return OrgMetadata( root_path=path, metadata=OrgMetadataFile.from_path( @@ -158,6 +162,9 @@ def from_path(cls, path: Path, template_vars: dict | None = None): ), template_vars=template_vars, column_defaults=cls.get_column_defaults(path), + data_dictionary=DataDictionary.from_path(dd_default_path) + if dd_default_path.exists() + else DataDictionary(), ) def product(self, name: str) -> ProductMetadata: @@ -183,6 +190,14 @@ def validate_metadata(self) -> dict[str, dict[str, list[str]]]: } return product_errors + def get_packaging_artifacts(self) 
-> list[Artifact]:
+        return Artifacts.from_path(
+            self.root_path / "packaging" / "artifacts.yml"
+        ).artifacts
+
+    def get_full_resource_path(self, file: str | Path):
+        return self.root_path / "packaging" / "resources" / file
+
     def query_dataset_destinations(
         self, tag: str
     ) -> list[ProductDatasetDestinationKey]:
diff --git a/dcpy/test/lifecycle/package/test_abstract_docs.py b/dcpy/test/lifecycle/package/test_abstract_docs.py
new file mode 100644
index 000000000..971058bfa
--- /dev/null
+++ b/dcpy/test/lifecycle/package/test_abstract_docs.py
@@ -0,0 +1,176 @@
+from pathlib import Path
+import pytest
+
+from dcpy.lifecycle.package import abstract_doc
+from dcpy.models.product.metadata import OrgMetadata
+from dcpy.models.design import elements as de
+
+
+@pytest.fixture
+def org_metadata(resources_path: Path):
+    return OrgMetadata.from_path(resources_path / "test_product_metadata_repo")
+
+
+def _assert_is_title_subtitle_row(row, comp_def):
+    assert row.merge_cells
+    title_cell, subtitle_cell = row.cells[0].value
+    assert title_cell.value.startswith(
+        comp_def.title
+    ), "The cell should contain the title"
+    assert (
+        subtitle_cell.value == comp_def.subtitle
+    ), "The cell should contain the subtitle"
+
+
+def _assert_is_image_row(row):
+    assert row.merge_cells, "The image row should be merged"
+    assert 1 == len(row.cells), "the image row should have one cell"
+    assert type(row.cells[0].value) is de.Image, "The value should be an image"
+
+
+def test_generating_asset_table(org_metadata: OrgMetadata):
+    xlsx_artifact = org_metadata.get_packaging_artifacts()[0]
+
+    # Limit this test to just the first component: the dataset.attributes table
+    xlsx_artifact.components = xlsx_artifact.components[0:1]
+
+    dataset_attributes_comp_def = xlsx_artifact.components[0]
+    assert dataset_attributes_comp_def.id == "dataset_information"
+
+    transit_zones_ds = (
+        org_metadata.product("transit_zones")
+        .dataset("transit_zones")
+        .dataset.model_dump()
+    )
+    artifacts = abstract_doc.generate_abstract_artifact(
+        artifact=xlsx_artifact,
+        org_metadata=org_metadata,
+        product="transit_zones",
+        dataset="transit_zones",
+    )
+    assert 1 == len(artifacts)
+
+    dataset_attributes_comp = artifacts[0]
+
+    # Dataset attributes component tests
+    assert dataset_attributes_comp_def.rows
+
+    image_row, title_subtitle_row, *attr_rows = dataset_attributes_comp.rows
+    assert len(dataset_attributes_comp_def.rows) == len(attr_rows)
+
+    _assert_is_image_row(image_row)
+    _assert_is_title_subtitle_row(title_subtitle_row, dataset_attributes_comp_def)
+
+    found_field_with_third_party_note = False
+    for field_name, row in zip(dataset_attributes_comp_def.rows, attr_rows):
+        data_dict_entry = org_metadata.data_dictionary.dataset["attributes"][field_name]
+        assert 2 == len(row.cells)
+        title_summary_cell, value_cell = row.cells
+
+        # Check the Value
+        assert (
+            transit_zones_ds["attributes"][field_name] == value_cell.value
+        ), "The attribute value should be correctly pulled from the dataset"
+
+        # A little more complicated.
Check the title and summary + assert 2 == len( + title_summary_cell.value + ), "the summary cell should have the correct number of subcells" + assert ( + data_dict_entry.summary in title_summary_cell.value[0].value + ), "The summary should be the top line" + + extra_description_cell = title_summary_cell.value[1].value + assert ( + data_dict_entry.extra_description in extra_description_cell + ), "The field's extra description should be included" + + if field_name == "attribution_link": + found_field_with_third_party_note = True + extra_note = data_dict_entry.custom["third_party_extra"] + assert ( + extra_note in extra_description_cell + ), "The extra description from the third party should be included in the description" + + assert ( + found_field_with_third_party_note + ), "Sanity check that we're actually testing the extra description" + + +def test_generating_revisions(org_metadata: OrgMetadata): + xlsx_artifact = org_metadata.get_packaging_artifacts()[0] + xlsx_artifact.components = xlsx_artifact.components[1:2] + + # Limit this test to just the second component: the dataset.revisions table + revisions_comp_def = xlsx_artifact.components[0] + assert revisions_comp_def.id == "revisions" + + transit_zones_ds = ( + org_metadata.product("transit_zones") + .dataset("transit_zones") + .dataset.model_dump() + ) + artifacts = abstract_doc.generate_abstract_artifact( + artifact=xlsx_artifact, + org_metadata=org_metadata, + product="transit_zones", + dataset="transit_zones", + ) + assert 1 == len(artifacts) + component = artifacts[0] + assert component + + title_subtitle_row, summary_row, column_name_row, *revision_rows = component.rows + + _assert_is_title_subtitle_row(title_subtitle_row, revisions_comp_def) + assert revisions_comp_def.description in summary_row.cells[0].value + assert len(revisions_comp_def.columns) == len(column_name_row.cells) # type: ignore + + assert len(transit_zones_ds["revisions"]) == len( + revision_rows + ), "Sanity check on destructuring above." + + +def test_column_docs(org_metadata: OrgMetadata): + xlsx_artifact = org_metadata.get_packaging_artifacts()[0] + + # Limit this test to just the third component: the dataset.columns table + xlsx_artifact.components = xlsx_artifact.components[2:3] + column_comp_def = xlsx_artifact.components[0] + assert column_comp_def.id == "columns" + + transit_zones_ds = ( + org_metadata.product("transit_zones") + .dataset("transit_zones") + .dataset.model_dump() + ) + artifacts = abstract_doc.generate_abstract_artifact( + artifact=xlsx_artifact, + org_metadata=org_metadata, + product="transit_zones", + dataset="transit_zones", + ) + assert 1 == len(artifacts) + component = artifacts[0] + assert component + + title_subtitle_row, col_names_row, col_descriptions_row, *col_rows = component.rows + + _assert_is_title_subtitle_row(title_subtitle_row, column_comp_def) + + assert len(transit_zones_ds["columns"]) == len( + col_rows + ), "There should be the correct number of column rows" + + assert column_comp_def.columns + assert "values" in column_comp_def.columns + values_col_index = column_comp_def.columns.index("values") + + # The last row is the borough, which has standardized values. + # There should be a better way to filter for this... + values_cell_sample = col_rows[-1].cells[values_col_index] + # sanity check on that though... 
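+    # The cell value here is the plain-text table built by make_value_table
+    # (tabulate, "presto" format), so substring checks are sufficient.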
+ assert "Manhattan" in values_cell_sample.value + assert ( + abstract_doc.MONOSPACED_FONT == values_cell_sample.style.font.name + ), "The values table requires a monospaced font, otherwise the columns won't line up!" diff --git a/dcpy/test/lifecycle/package/test_assemble_from_bytes.py b/dcpy/test/lifecycle/package/test_assemble_from_bytes.py index 78cbc1139..30487bb66 100644 --- a/dcpy/test/lifecycle/package/test_assemble_from_bytes.py +++ b/dcpy/test/lifecycle/package/test_assemble_from_bytes.py @@ -1,11 +1,9 @@ from pathlib import Path import pytest -import shutil from unittest.mock import patch, call from dcpy.lifecycle.package import assemble import dcpy.models.product.dataset.metadata_v2 as ds -from dcpy.lifecycle.package import oti_xlsx SHAPEFILE = ds.File( id="shp", @@ -156,70 +154,73 @@ def test_pull_destination_files_md_only(mock_urlretrieve, tmp_path): mock_urlretrieve.assert_has_calls(expected_calls) -@pytest.fixture -def colp_package_path(resources_path: Path): - return resources_path / "product_metadata" / "colp_single_feature_package" - - -@patch("dcpy.lifecycle.package.assemble.pull_destination_files") -def test_assemble_from_bytes(pull_destination_files_mock, tmp_path, colp_package_path): - MOCK_PULLED_PACKAGE_PATH = tmp_path - pull_destination_files_mock.side_effect = lambda *args, **kwargs: shutil.copytree( - colp_package_path, MOCK_PULLED_PACKAGE_PATH, dirs_exist_ok=True - ) - metadata = ds.Metadata.from_path( - colp_package_path / "metadata.yml", template_vars={"version": "24b"} - ) - - TEST_CASE_NAME_TO_OVERRIDES = [ - ["no_overrides", {}], - ["file_overrides", {"file_id": "primary_shapefile"}], - [ - "dest_overrides", - { - "file_id": "primary_shapefile", - "destination_id": "socrata_prod", - }, - ], - ] - - for test_case_name, overrides in TEST_CASE_NAME_TO_OVERRIDES: - metadata.files.append( - ds.FileAndOverrides( - file=ds.File( - id=test_case_name, - filename=test_case_name + ".xlsx", - type=oti_xlsx.OTI_METADATA_FILE_TYPE, - custom={ - assemble.ASSEMBLY_INSTRUCTIONS_KEY: { - assemble.METADATA_OVERRIDE_KEY: overrides - } - } - if overrides - else {}, - ) - ) - ) - - assemble.assemble_dataset_from_bytes( - dataset_metadata=metadata, - source_destination_id="socrata", - out_path=tmp_path, - product="colp", - version="24c", - ) - - attachments_path = MOCK_PULLED_PACKAGE_PATH / "attachments" - assert attachments_path.exists(), "Sanity check that the mock side_effect works" - - for test_case_name, overrides in TEST_CASE_NAME_TO_OVERRIDES: - dataset = ( - metadata.calculate_metadata(**overrides) if overrides else metadata.dataset # type: ignore - ) - - xlsx_path = attachments_path / (test_case_name + ".xlsx") - assert xlsx_path.exists(), "The OTI XLSX should have been generated" - assert ( - oti_xlsx._get_dataset_description(xlsx_path) - == dataset.attributes.description - ), "The XLSX should have the correct description" +# TODO: This is a useful test, but I want to generally refactor all these product metadata tests to +# use the test_product_metadata repo. 
+ +# @pytest.fixture +# def colp_package_path(resources_path: Path): +# return resources_path / "product_metadata" / "colp_single_feature_package" + + +# @patch("dcpy.lifecycle.package.assemble.pull_destination_files") +# def test_assemble_from_bytes(pull_destination_files_mock, tmp_path, colp_package_path): +# MOCK_PULLED_PACKAGE_PATH = tmp_path +# pull_destination_files_mock.side_effect = lambda *args, **kwargs: shutil.copytree( +# colp_package_path, MOCK_PULLED_PACKAGE_PATH, dirs_exist_ok=True +# ) +# metadata = ds.Metadata.from_path( +# colp_package_path / "metadata.yml", template_vars={"version": "24b"} +# ) + +# TEST_CASE_NAME_TO_OVERRIDES = [ +# ["no_overrides", {}], +# ["file_overrides", {"file_id": "primary_shapefile"}], +# [ +# "dest_overrides", +# { +# "file_id": "primary_shapefile", +# "destination_id": "socrata_prod", +# }, +# ], +# ] + +# for test_case_name, overrides in TEST_CASE_NAME_TO_OVERRIDES: +# metadata.files.append( +# ds.FileAndOverrides( +# file=ds.File( +# id=test_case_name, +# filename=test_case_name + ".xlsx", +# type=oti_xlsx.OTI_METADATA_FILE_TYPE, +# custom={ +# assemble.ASSEMBLY_INSTRUCTIONS_KEY: { +# assemble.METADATA_OVERRIDE_KEY: overrides +# } +# } +# if overrides +# else {}, +# ) +# ) +# ) + +# assemble.assemble_dataset_from_bytes( +# product="colp", +# dataset_metadata=metadata, +# source_destination_id="socrata", +# out_path=tmp_path, +# version="24c", +# ) + +# attachments_path = MOCK_PULLED_PACKAGE_PATH / "attachments" +# assert attachments_path.exists(), "Sanity check that the mock side_effect works" + +# for test_case_name, overrides in TEST_CASE_NAME_TO_OVERRIDES: +# dataset = ( +# metadata.calculate_metadata(**overrides) if overrides else metadata.dataset # type: ignore +# ) + +# xlsx_path = attachments_path / (test_case_name + ".xlsx") +# assert xlsx_path.exists(), "The OTI XLSX should have been generated" +# assert ( +# oti_xlsx._get_dataset_description(xlsx_path) +# == dataset.attributes.description +# ), "The XLSX should have the correct description" diff --git a/dcpy/test/lifecycle/package/test_generate_data_dictionary.py b/dcpy/test/lifecycle/package/test_generate_data_dictionary.py index c2962878b..03b1feed1 100644 --- a/dcpy/test/lifecycle/package/test_generate_data_dictionary.py +++ b/dcpy/test/lifecycle/package/test_generate_data_dictionary.py @@ -1,5 +1,5 @@ +from pathlib import Path import pytest -from unittest import TestCase from dcpy.test.lifecycle.package.conftest import ( PACKAGE_RESOURCES_PATH, TEST_ASSEMBLED_PACKAGE_AND_METADATA_PATH, @@ -7,12 +7,17 @@ TEMP_DATA_PATH, ) -from dcpy.lifecycle.package import generate_metadata_assets, oti_xlsx -from dcpy.models.product.dataset import metadata_v2 as md +from dcpy.lifecycle.package import generate_metadata_assets, xlsx_writer +from dcpy.models.product.metadata import OrgMetadata + + +@pytest.fixture +def org_metadata(resources_path: Path): + return OrgMetadata.from_path(resources_path / "test_product_metadata_repo") @pytest.mark.usefixtures("file_setup_teardown") -class TestDataDictionary(TestCase): +class TestDataDictionary(object): package_path = TEST_ASSEMBLED_PACKAGE_AND_METADATA_PATH yaml_path = TEST_METADATA_YAML_PATH html_path = TEMP_DATA_PATH / "metadata.html" @@ -85,9 +90,10 @@ def test_generate_pdf_from_yaml(self): ) assert pdf_path.exists() - def test_generate_xslx(self): - oti_xlsx.write_oti_xlsx( - dataset=md.Metadata.from_path(self.package_path / "metadata.yml").dataset, - output_path=self.output_xlsx_path, + def test_generate_xslx(self, org_metadata): + 
xlsx_writer.write_xlsx(
+            org_md=org_metadata,
+            product="transit_zones",  # This one has some mock revision history, so it's a good test case.
+            output_path=TestDataDictionary.output_xlsx_path,
         )
-        assert self.output_xlsx_path.exists()
+        assert TestDataDictionary.output_xlsx_path.exists()
diff --git a/dcpy/test/resources/test_product_metadata_repo/data_dictionary.yml b/dcpy/test/resources/test_product_metadata_repo/data_dictionary.yml
new file mode 100644
index 000000000..558acf870
--- /dev/null
+++ b/dcpy/test/resources/test_product_metadata_repo/data_dictionary.yml
@@ -0,0 +1,65 @@
+# Meta Metadata
+
+org: {}
+
+product: {}
+
+dataset:
+  columns:
+    name:
+      summary: Name of the column exactly as it appears in the dataset.
+    description:
+      summary: A brief, plain-language explanation of what the data in the column means.
+    values:
+      summary: >
+        Specifies if there is an expected range and/or format of possible values. For example, if the data type is Date & Time,
+        this field will note whether the timestamp is MM/DD/YYYY or MM/YYYY. If the Column Name is ice cream, this field might
+        note that values can be Chocolate, Vanilla or Strawberry.
+      extra_description: >
+        If relevant, this field specifies the unit of measurement of the data field, e.g. thousands, millions, $ value, miles, feet, year, etc.
+    limitations:
+      summary: |
+        Describes any unique characteristics or potential analytical limitations presented by this field, including:
+        - the reasoning for any null, zero, or empty values in the data
+        - if the data in the column was integrated from another dataset or organization
+        - if the data covered includes a different time period
+        - the source of the column and how the data in the column was generated.
+      extra_description: >
+        For example, information on how the data in this column was generated can include whether the data was self-reported directly by a person,
+        system generated by a database or agency system, derived through analytical manipulation of other fields or records,
+        or obtained from a different agency.
+    notes:
+      summary: |
+        Provides any additional relevant information about the data in the column, including:
+        - definitions of acronyms, special terms or codes, or jargon that appears in the field values;
+        - the meaning of confusing or non-intuitive values in the data;
+        - how the information in this column relates to information in other columns;
+        - other unique details about this column.
+      extra_description:
+
+  revisions:
+    date:
+      summary: "Date"
+      extra_description: "The date that the change went into effect"
+    summary:
+      summary: "Change Highlights"
+      extra_description: "What changed?"
+    notes:
+      summary: "Comments"
+      extra_description: "Whatever else you want to say"
+
+  attributes:
+    display_name:
+      summary: "Dataset Name"
+      extra_description: ""
+    agency:
+      summary: "Data Provided by"
+      extra_description: The name of the NYC agency providing this data to the public.
+    each_row_is_a:
+      summary: "Each row is a..."
+ extra_description: The unit of analysis/level of aggregation of the dataset + attribution_link: + summary: "Link to dataset" + extra_description: Where is the link to the dataset + custom: + third_party_extra: Something relevant to a third party diff --git a/dcpy/test/resources/test_product_metadata_repo/metadata.yml b/dcpy/test/resources/test_product_metadata_repo/metadata.yml index b1e3966b8..215b721b9 100644 --- a/dcpy/test/resources/test_product_metadata_repo/metadata.yml +++ b/dcpy/test/resources/test_product_metadata_repo/metadata.yml @@ -2,7 +2,7 @@ attributes: agency: "{{ agency }}" category: City Government attribution: DCP - attributionLink: https://www.nyc.gov/site/planning/data-maps/open-data.page + attribution_link: https://www.nyc.gov/site/planning/data-maps/open-data.page contact_email: opendata@planning.nyc.gov products: diff --git a/dcpy/test/resources/test_product_metadata_repo/packaging/artifacts.yml b/dcpy/test/resources/test_product_metadata_repo/packaging/artifacts.yml new file mode 100644 index 000000000..947b04790 --- /dev/null +++ b/dcpy/test/resources/test_product_metadata_repo/packaging/artifacts.yml @@ -0,0 +1,44 @@ +artifacts: + - name: oti_xlsx + type: xlsx + components: + - id: dataset_information + name: Dataset Information + type: object_table + index: 0 + title: Data Dictionary + subtitle: Dataset Information + extra_field_description_field: third_party_extra + image_path: Open_Data_Logo.png + data_source: dataset.attributes + rows: + - display_name + - agency + - each_row_is_a + - attribution_link + - id: revisions + name: Revisions + description: my test description + type: list_table + index: 1 + title: Revisions + subtitle: Product Revisions + data_source: dataset.revisions + columns: + - date + - summary + - notes + include_column_description_row: False + - id: columns + name: Column Docs + type: list_table + index: 2 + title: Data Dictionary + subtitle: Column Information + data_source: dataset.columns + columns: + - name + - description + - values + - limitations + - notes diff --git a/dcpy/test/resources/test_product_metadata_repo/packaging/resources/Open_Data_Logo.png b/dcpy/test/resources/test_product_metadata_repo/packaging/resources/Open_Data_Logo.png new file mode 100644 index 000000000..ae8cce49e Binary files /dev/null and b/dcpy/test/resources/test_product_metadata_repo/packaging/resources/Open_Data_Logo.png differ diff --git a/dcpy/test/resources/test_product_metadata_repo/products/transit_zones/transit_zones/metadata.yml b/dcpy/test/resources/test_product_metadata_repo/products/transit_zones/transit_zones/metadata.yml index 0d7459758..0443748ea 100644 --- a/dcpy/test/resources/test_product_metadata_repo/products/transit_zones/transit_zones/metadata.yml +++ b/dcpy/test/resources/test_product_metadata_repo/products/transit_zones/transit_zones/metadata.yml @@ -57,3 +57,34 @@ columns: name: Shape_Area data_type: decimal description: Area of feature in internal units squared. 
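+  # Test-only column: exists purely to exercise the standardized-values rendering.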
+ - id: fake_borough + name: Borough + data_type: text + description: + NYC borough - 1 (Manhattan), 2 (Bronx), 3 (Brooklyn), 4 (Queens), 5 + (Staten Island) + checks: + non_nullable: true + example: None + values: + - value: "1" + description: Manhattan + - value: "2" + description: Bronx + - value: "3" + description: Brooklyn + - value: "4" + description: Queens + - value: "5" + description: Staten Island + +revisions: + - date: "2024-01-01" + summary: added column + notes: explanation for why we added a column + - date: "2024-03-01" + summary: removed column + notes: explanation for why we removed a column + - date: "2024-04-01" + summary: one more for good measure + notes: Another revision diff --git a/products/template/build_scripts/export.py b/products/template/build_scripts/export.py index 93fcc354f..fb4c2bd79 100644 --- a/products/template/build_scripts/export.py +++ b/products/template/build_scripts/export.py @@ -1,10 +1,12 @@ +from pathlib import Path import shutil +from dcpy.configuration import PRODUCT_METADATA_REPO_PATH from dcpy.lifecycle.package import generate_metadata_assets -from dcpy.lifecycle.package import oti_xlsx +from dcpy.lifecycle.package import xlsx_writer from dcpy.connectors.edm import product_metadata, publishing from dcpy.utils.logging import logger -from dcpy.models.product.dataset import metadata_v2 as md +from dcpy.models.product.metadata import OrgMetadata from . import PRODUCT_PATH, OUTPUT_DIR, PG_CLIENT, BUILD_KEY @@ -22,6 +24,9 @@ ], } +assert PRODUCT_METADATA_REPO_PATH +org_metadata = OrgMetadata.from_path(Path(PRODUCT_METADATA_REPO_PATH)) + def generate_metadata(): dataset_metadata_yml = product_metadata.download( @@ -37,8 +42,9 @@ def generate_metadata(): PRODUCT_PATH / "data_dictionary.pdf", generate_metadata_assets.DEFAULT_DATA_DICTIONARY_STYLESHEET_PATH, ) - oti_xlsx.write_oti_xlsx( - dataset=md.Metadata.from_path(dataset_metadata_yml).dataset, + xlsx_writer.write_xlsx( + org_md=org_metadata, + product="template_db", output_path=PRODUCT_PATH / "data_dictionary.xlsx", )
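
Usage sketch for the new writer (a minimal example, assuming a local checkout of
the product-metadata repo with the `oti_xlsx` artifact defined in
packaging/artifacts.yml, as in the test resources above; the path is a
placeholder):

    from pathlib import Path

    from dcpy.lifecycle.package import xlsx_writer
    from dcpy.models.product.metadata import OrgMetadata

    org_md = OrgMetadata.from_path(Path("path/to/product-metadata"))
    xlsx_writer.write_xlsx(
        org_md=org_md,
        product="transit_zones",
        dataset="transit_zones",
        artifact_name="oti_xlsx",
        output_path=Path("data_dictionary.xlsx"),
    )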