From 6e4ef5bdcf54a8db190385f84d8924223a7f38ef Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Wed, 15 Oct 2025 18:29:34 -0400 Subject: [PATCH 1/4] Format data_analyzer.py and text_generator_plugins.py --- dbldatagen/data_analyzer.py | 600 +++++++++++++++------------ dbldatagen/text_generator_plugins.py | 400 ++++++++++-------- pyproject.toml | 8 - 3 files changed, 562 insertions(+), 446 deletions(-) diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py index 195c49b9..58d1e501 100644 --- a/dbldatagen/data_analyzer.py +++ b/dbldatagen/data_analyzer.py @@ -8,60 +8,72 @@ This code is experimental and both APIs and code generated is liable to change in future versions. """ import logging +from typing import SupportsFloat, SupportsIndex -import pyspark.sql as ssql -from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \ - TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType +from pyspark.sql import DataFrame, Row, SparkSession, types -from .spark_singleton import SparkSingleton -from .utils import strip_margins +from dbldatagen.spark_singleton import SparkSingleton +from dbldatagen.utils import strip_margins -SUMMARY_FIELD_NAME = "summary" -SUMMARY_FIELD_NAME_RENAMED = "__summary__" -DATA_SUMMARY_FIELD_NAME = "__data_summary__" + +SUMMARY_FIELD_NAME: str = "summary" +SUMMARY_FIELD_NAME_RENAMED: str = "__summary__" +DATA_SUMMARY_FIELD_NAME: str = "__data_summary__" class DataAnalyzer: - """This class is used to analyze an existing data set to assist in generating a test data set with similar - characteristics, and to generate code from existing schemas and data + """ + This class is used to analyze an existing dataset to assist in generating a test data set with similar data + characteristics. Analyzer results and to generate code from existing schemas and data. - :param df: Spark dataframe to analyze - :param sparkSession: Spark session instance to use when performing spark operations - :param debug: If True, additional debug information is logged - :param verbose: If True, additional information is logged + :param df: Spark ``DataFrame`` to analyze + :param sparkSession: ``SparkSession`` to use + :param debug: Whether to log additional debug information (default `False`) + :param verbose: Whether to log detailed execution information (default `False`) .. 
warning:: Experimental - """ - _DEFAULT_GENERATED_NAME = "synthetic_data" - - _GENERATED_COMMENT = strip_margins(""" - |# Code snippet generated with Databricks Labs Data Generator (`dbldatagen`) DataAnalyzer class - |# Install with `pip install dbldatagen` or in notebook with `%pip install dbldatagen` - |# See the following resources for more details: - |# - |# Getting Started - [https://databrickslabs.github.io/dbldatagen/public_docs/APIDOCS.html] - |# Github project - [https://github.com/databrickslabs/dbldatagen] - |#""", '|') - - _GENERATED_FROM_SCHEMA_COMMENT = strip_margins(""" - |# Column definitions are stubs only - modify to generate correct data - |#""", '|') - - def __init__(self, df=None, sparkSession=None, debug=False, verbose=False): - """ Constructor: - :param df: Dataframe to analyze - :param sparkSession: Spark session to use + debug: bool + verbose: bool + _sparkSession: SparkSession + _df: DataFrame + _dataSummary: dict[str, dict[str, object]] | None + _DEFAULT_GENERATED_NAME: str = "synthetic_data" + _GENERATED_COMMENT: str = strip_margins( + """ + |# Code snippet generated with Databricks Labs Data Generator (`dbldatagen`) DataAnalyzer class + |# Install with `pip install dbldatagen` or in notebook with `%pip install dbldatagen` + |# See the following resources for more details: + |# + |# Getting Started - [https://databrickslabs.github.io/dbldatagen/public_docs/APIDOCS.html] + |# Github project - [https://github.com/databrickslabs/dbldatagen] + |# + """, + marginChar="|" + ) + + _GENERATED_FROM_SCHEMA_COMMENT: str = strip_margins( """ - # set up logging + |# Column definitions are stubs only - modify to generate correct data + |# + """, + marginChar="|" + ) + + def __init__( + self, + df: DataFrame | None = None, + sparkSession: SparkSession | None = None, + debug: bool = False, + verbose: bool = False + ) -> None: self.verbose = verbose self.debug = debug - self._setupLogger() - assert df is not None, "dataframe must be supplied" - + if df is None: + raise ValueError("Argument `df` must be supplied when initializing a `DataAnalyzer`") self._df = df if sparkSession is None: @@ -70,10 +82,10 @@ def __init__(self, df=None, sparkSession=None, debug=False, verbose=False): self._sparkSession = sparkSession self._dataSummary = None - def _setupLogger(self): - """Set up logging - - This will set the logger at warning, info or debug levels depending on the instance construction parameters + def _setupLogger(self) -> None: + """ + Sets up logging for the ``DataAnalyzer``. Configures the logger at warning, info or debug levels depending on + the user-requested behavior. """ self.logger = logging.getLogger("DataAnalyzer") if self.debug: @@ -83,49 +95,64 @@ def _setupLogger(self): else: self.logger.setLevel(logging.WARNING) - def _displayRow(self, row): - """Display details for row""" - results = [] + @staticmethod + def _displayRow(row: Row) -> str: + """ + Displays details for a row as a string. 

+
+        :param row: PySpark ``Row`` object to display
+        :returns: String representing row-level details
+        """
         row_key_pairs = row.asDict()
-        for x in row_key_pairs:
-            results.append(f"{x}: {row[x]}")
-
-        return ", ".join(results)
-
-    def _addMeasureToSummary(self, measureName, *, summaryExpr="''", fieldExprs=None, dfData=None, rowLimit=1,
-                             dfSummary=None):
-        """ Add a measure to the summary dataframe
-
-        :param measureName: Name of measure
-        :param summaryExpr: Summary expression
-        :param fieldExprs: list of field expressions (or generator)
-        :param dfData: Source data df - data being summarized
-        :param rowLimit: Number of rows to get for measure
-        :param dfSummary: Summary df
-        :return: dfSummary with new measure added
+        return ", ".join([f"{x}: {row[x]}" for x in row_key_pairs])
+
+    @staticmethod
+    def _addMeasureToSummary(
+        measureName: str,
+        *,
+        summaryExpr: str = "''",
+        fieldExprs: list[str] | None = None,
+        dfData: DataFrame | None,
+        rowLimit: int = 1,
+        dfSummary: DataFrame | None = None
+    ) -> DataFrame:
+        """
+        Adds a new measure to the summary ``DataFrame``.
+
+        :param measureName: Measure name
+        :param summaryExpr: Measure expression as a Spark SQL statement
+        :param fieldExprs: Optional list of field expressions as Spark SQL statements
+        :param dfData: Source ``DataFrame`` to summarize
+        :param rowLimit: Number of rows to use for ``DataFrame`` summarization
+        :param dfSummary: Summary metrics ``DataFrame``
+        :return: Summary metrics ``DataFrame`` with the added measure
         """
-        assert dfData is not None, "source data dataframe must be supplied"
-        assert measureName is not None and len(measureName) > 0, "invalid measure name"
+        if dfData is None:
+            raise ValueError("Input DataFrame `dfData` must be supplied when adding measures to a summary")
+
+        if not measureName:
+            raise ValueError("Input measure name must be a non-empty string")
 
         # add measure name and measure summary
-        exprs = [f"'{measureName}' as measure_", f"string({summaryExpr}) as summary_"]
+        expressions = [f"'{measureName}' as measure_", f"string({summaryExpr}) as summary_"]
 
-        # add measures for fields
-        exprs.extend(fieldExprs)
+        if fieldExprs:
+            expressions.extend(fieldExprs)
 
         if dfSummary is not None:
-            dfResult = dfSummary.union(dfData.selectExpr(*exprs).limit(rowLimit))
-        else:
-            dfResult = dfData.selectExpr(*exprs).limit(rowLimit)
+            return dfSummary.union(dfData.selectExpr(*expressions).limit(rowLimit))
 
-        return dfResult
+        return dfData.selectExpr(*expressions).limit(rowLimit)
 
-    def _get_dataframe_describe_stats(self, df):
-        """ Get summary statistics for dataframe handling renaming of summary field if necessary"""
-        print("schema", df.schema)
+    @staticmethod
+    def _get_dataframe_describe_stats(df: DataFrame) -> DataFrame:
+        """
+        Gets a summary ``DataFrame`` with column-level statistics about the input ``DataFrame``.
+        :param df: Input ``DataFrame``
+        :returns: Summary ``DataFrame`` with column-level statistics
+        """
         src_fields = [fld.name for fld in df.schema.fields]
-        print("src_fields", src_fields)
         renamed_summary = False
 
         # get summary statistics handling the case where a field named 'summary' exists
@@ -145,114 +172,127 @@ def _get_dataframe_describe_stats(self, df):
 
         return summary_df
 
-    def summarizeToDF(self):
-        """ Generate summary analysis of data set as dataframe
+    def summarizeToDF(self) -> DataFrame:
+        """
+        Generates a summary analysis of the input ``DataFrame`` of the ``DataAnalyzer``.
 
-        :return: Summary results as dataframe
+        :returns: Summary ``DataFrame`` with analyzer results
 
+        .. 
note:: The resulting dataframe can be displayed with the ``display`` function in a notebook environment or with the ``show`` method. The output is also used in code generation to generate more accurate code. """ - self._df.cache().createOrReplaceTempView("data_analysis_summary") + self._df.createOrReplaceTempView("data_analysis_summary") total_count = self._df.count() * 1.0 - dtypes = self._df.dtypes - - # schema information - dfDataSummary = self._addMeasureToSummary( - 'schema', + data_summary_df = self._addMeasureToSummary( + measureName="schema", summaryExpr=f"""to_json(named_struct('column_count', {len(dtypes)}))""", fieldExprs=[f"'{dtype[1]}' as {dtype[0]}" for dtype in dtypes], - dfData=self._df) + dfData=self._df + ) - # count - dfDataSummary = self._addMeasureToSummary( - 'count', + data_summary_df = self._addMeasureToSummary( + measureName="count", summaryExpr=f"{total_count}", fieldExprs=[f"string(count({dtype[0]})) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) - - dfDataSummary = self._addMeasureToSummary( - 'null_probability', - fieldExprs=[f"""string( round( ({total_count} - count({dtype[0]})) /{total_count}, 2)) as {dtype[0]}""" - for dtype in dtypes], + dfSummary=data_summary_df + ) + + data_summary_df = self._addMeasureToSummary( + measureName="null_probability", + fieldExprs=[ + f"""string( round( ({total_count} - count({dtype[0]})) /{total_count}, 2)) as {dtype[0]}""" + for dtype in dtypes + ], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) # distinct count - dfDataSummary = self._addMeasureToSummary( - 'distinct_count', + data_summary_df = self._addMeasureToSummary( + measureName="distinct_count", summaryExpr="count(distinct *)", fieldExprs=[f"string(count(distinct {dtype[0]})) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) # min - dfDataSummary = self._addMeasureToSummary( - 'min', + data_summary_df = self._addMeasureToSummary( + measureName="min", fieldExprs=[f"string(min({dtype[0]})) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df) - dfDataSummary = self._addMeasureToSummary( - 'max', + data_summary_df = self._addMeasureToSummary( + measureName="max", fieldExprs=[f"string(max({dtype[0]})) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) - descriptionDf = (self._get_dataframe_describe_stats(self._df) - .where(f"{DATA_SUMMARY_FIELD_NAME} in ('mean', 'stddev')")) - describeData = descriptionDf.collect() + description_df = ( + self + ._get_dataframe_describe_stats(self._df) + .where(f"{DATA_SUMMARY_FIELD_NAME} in ('mean', 'stddev')") + ) + description_data = description_df.collect() - for row in describeData: + for row in description_data: measure = row[DATA_SUMMARY_FIELD_NAME] - values = {k[0]: '' for k in dtypes} + values = {k[0]: "" for k in dtypes} row_key_pairs = row.asDict() for k1 in row_key_pairs: values[k1] = str(row[k1]) - dfDataSummary = self._addMeasureToSummary( - measure, + data_summary_df = self._addMeasureToSummary( + measureName=measure, fieldExprs=[f"'{values[dtype[0]]}'" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) # string characteristics for strings and string representation of other values - dfDataSummary = self._addMeasureToSummary( - 'print_len_min', + data_summary_df = self._addMeasureToSummary( + measureName="print_len_min", 
fieldExprs=[f"string(min(length(string({dtype[0]})))) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) - dfDataSummary = self._addMeasureToSummary( - 'print_len_max', + data_summary_df = self._addMeasureToSummary( + measureName="print_len_max", fieldExprs=[f"string(max(length(string({dtype[0]})))) as {dtype[0]}" for dtype in dtypes], dfData=self._df, - dfSummary=dfDataSummary) + dfSummary=data_summary_df + ) - return dfDataSummary + return data_summary_df - def summarize(self, suppressOutput=False): - """ Generate summary analysis of data set and return / print summary results + def summarize(self, suppressOutput: bool = False) -> str: + """ + Generates a summary analysis of the input ``DataFrame`` and returns the analysis as a string. Optionally prints + the summary analysis. - :param suppressOutput: If False, prints results to console also - :return: Summary results as string + :param suppressOutput: Whether to print the summary analysis (default `False`) + :return: Summary analysis as string """ - dfSummary = self.summarizeToDF() + summary_df = self.summarizeToDF() results = [ "Data set summary", "================" ] - for r in dfSummary.collect(): - results.append(self._displayRow(r)) + for row in summary_df.collect(): + results.append(self._displayRow(row)) summary = "\n".join([str(x) for x in results]) @@ -262,213 +302,229 @@ def summarize(self, suppressOutput=False): return summary @classmethod - def _valueFromSummary(cls, dataSummary, colName, measure, defaultValue): - """ Get value from data summary - - :param dataSummary: Data summary to search, optional - :param colName: Column name of column to get value for - :param measure: Measure name of measure to get value for - :param defaultValue: Default value if any other argument is not specified or value could not be found in - data summary - :return: Value from lookup or `defaultValue` if not found + def _valueFromSummary( + cls, + dataSummary: dict[str, dict[str, object]] | None = None, + colName: str | None = None, + measure: str | None = None, + defaultValue: int | float | str | None = None + ) -> object: + """ + Gets a measure value from a data summary given a measure name and column name. Returns a default value when the + measure value cannot be found. 
+ + :param dataSummary: Optional data summary to search (if ``None``, the default value is returned) + :param colName: Optional column name + :param measure: Optional measure name + :param defaultValue: Default return value + :return: Measure value or default value """ - if dataSummary is not None and colName is not None and measure is not None: - if measure in dataSummary: - measureValues = dataSummary[measure] + if dataSummary is None or colName is None or measure is None: + return defaultValue - if colName in measureValues: - return measureValues[colName] + if measure not in dataSummary: + return defaultValue - # return default value if value could not be looked up or found - return defaultValue + measure_values = dataSummary[measure] + if colName not in measure_values: + return defaultValue + + return measure_values[colName] @classmethod - def _generatorDefaultAttributesFromType(cls, sqlType, colName=None, dataSummary=None, sourceDf=None): - """ Generate default set of attributes for each data type + def _generatorDefaultAttributesFromType( + cls, + sqlType: types.DataType, + colName: str | None = None, + dataSummary: dict | None = None + ) -> str: + """ + Generates a Spark SQL expression for the input column and data type. Optionally uses ``DataAnalyzer`` summary + statistics to create Spark SQL expressions for generating data similar to the input ``DataFrame``. - :param sqlType: Instance of `pyspark.sql.types.DataType` - :param colName: Name of column being generated - :param dataSummary: Map of maps of attributes from data summary, optional - :param sourceDf: Source dataframe to retrieve attributes of real data, optional - :return: Attribute string for supplied sqlType + :param sqlType: Data type as an instance of ``pyspark.sql.types.DataType`` + :param colName: Column name + :param dataSummary: Optional map of maps of attributes from the data summary + :return: Spark SQL expression for supplied column and data type - When generating code from a schema, we have no data heuristics to determine how data should be generated, - so goal is to just generate code that produces some data. + .. note:: + When generating expressions from a schema, no data heuristics are available to determine how data should be + generated. This method will use default values according to Spark's data type limits to generate working + expressions for data generation. Users are expected to modify the generated code to their needs. 
""" - assert isinstance(sqlType, DataType) + if not isinstance(sqlType, types.DataType): + raise ValueError( + f"Argument 'sqlType' with type {type(sqlType)} must be an instance of `pyspark.sql.types.DataType`" + ) - if sqlType == StringType(): + if sqlType == types.StringType(): result = """template=r'\\\\w'""" - elif sqlType in [IntegerType(), LongType()]: - minValue = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) - maxValue = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000000) - result = f"""minValue={minValue}, maxValue={maxValue}""" - elif sqlType == ByteType(): - minValue = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) - maxValue = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=127) - result = f"""minValue={minValue}, maxValue={maxValue}""" - elif sqlType == ShortType(): - minValue = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) - maxValue = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=32767) - result = f"""minValue={minValue}, maxValue={maxValue}""" - elif sqlType == BooleanType(): + + elif sqlType in [types.IntegerType(), types.LongType()]: + min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) + max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000000) + result = f"""minValue={min_value}, maxValue={max_value}""" + + elif sqlType == types.ByteType(): + min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) + max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=127) + result = f"""minValue={min_value}, maxValue={max_value}""" + + elif sqlType == types.ShortType(): + min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) + max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=32767) + result = f"""minValue={min_value}, maxValue={max_value}""" + + elif sqlType == types.BooleanType(): result = """expr='id % 2 = 1'""" - elif sqlType == DateType(): + + elif sqlType == types.DateType(): result = """expr='current_date()'""" - elif isinstance(sqlType, DecimalType): - minValue = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) - maxValue = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000) - result = f"""minValue={minValue}, maxValue={maxValue}""" - elif sqlType in [FloatType(), DoubleType()]: - minValue = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0.0) - maxValue = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000000.0) - result = f"""minValue={minValue}, maxValue={maxValue}, step=0.1""" - elif sqlType == TimestampType(): + + elif isinstance(sqlType, types.DecimalType): + min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0) + max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000) + result = f"""minValue={min_value}, maxValue={max_value}""" + + elif sqlType in [types.FloatType(), types.DoubleType()]: + min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0.0) + max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000000.0) + result = f"""minValue={min_value}, maxValue={max_value}, step=0.1""" + + elif sqlType == types.TimestampType(): result = """begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute" """ - elif sqlType == BinaryType(): + + elif sqlType == types.BinaryType(): result = """expr="cast('dbldatagen generated synthetic data' 
as binary)" """ + else: result = """expr='null'""" - percentNullsValue = float(cls._valueFromSummary(dataSummary, colName, "null_probability", defaultValue=0.0)) + summary_value = cls._valueFromSummary(dataSummary, colName, "null_probability", defaultValue=0.0) + percent_nulls_value = ( + float(summary_value) if isinstance(summary_value, str | SupportsFloat | SupportsIndex) else 0.0 + ) - if percentNullsValue > 0.0: - result = result + f", percentNulls={percentNullsValue}" + if percent_nulls_value > 0.0: + result = result + f", percentNulls={percent_nulls_value}" return result @classmethod - def _scriptDataGeneratorCode(cls, schema, *, dataSummary=None, sourceDf=None, suppressOutput=False, name=None): + def _scriptDataGeneratorCode( + cls, + schema: types.StructType, + *, + dataSummary: dict | None = None, + sourceDf: DataFrame | None = None, + suppressOutput: bool = False, + name: str | None = None + ) -> str: """ - Generate outline data generator code from an existing dataframe - - This will generate a data generator spec from an existing dataframe. The resulting code - can be used to generate a data generation specification. + Generates code to build a ``DataGenerator`` from an existing dataframe. Analyzes the dataframe passed to the + constructor of the ``DataAnalyzer`` and returns a script for generating similar data. - Note at this point in time, the code generated is stub code only. - For most uses, it will require further modification - however it provides a starting point - for generation of the specification for a given data set. - - The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object. - - :param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema` - :param dataSummary: Map of maps of attributes from data summary, optional - :param sourceDf: Source dataframe to retrieve attributes of real data, optional - :param suppressOutput: Suppress printing of generated code if True + :param schema: Pyspark schema as a ``StructType`` + :param dataSummary: Optional map of maps of attributes from the data summary + :param sourceDf: Optional ``DataFrame`` to retrieve attributes from existing data + :param suppressOutput: Whether to suppress printing attributes during execution (default `False`) :param name: Optional name for data generator - :return: String containing skeleton code + :return: Data generation code string + .. note:: + Code generated by this method should be treated as experimental. For most uses, generated code requires further + modification. Results are intended to provide an initial script for generating data from the input dataset. 
""" - assert isinstance(schema, StructType), "expecting valid Pyspark Schema" - - stmts = [] + statements = [] if name is None: name = cls._DEFAULT_GENERATED_NAME - stmts.append(cls._GENERATED_COMMENT) - - stmts.append("import dbldatagen as dg") - stmts.append("import pyspark.sql.types") - - stmts.append(cls._GENERATED_FROM_SCHEMA_COMMENT) - - stmts.append(strip_margins( - f"""generation_spec = ( - | dg.DataGenerator(sparkSession=spark, - | name='{name}', + statements.append(cls._GENERATED_COMMENT) + statements.append("import dbldatagen as dg") + statements.append("import pyspark.sql.types") + statements.append(cls._GENERATED_FROM_SCHEMA_COMMENT) + statements.append( + strip_margins( + f"""generation_spec = ( + | dg.DataGenerator(sparkSession=spark, + | name='{name}', | rows=100000, | random=True, | )""", - '|')) + marginChar="|" + ) + ) indent = " " - for fld in schema.fields: - col_name = fld.name - col_type = fld.dataType.simpleString() - - if isinstance(fld.dataType, ArrayType): - col_type = fld.dataType.elementType.simpleString() - field_attributes = cls._generatorDefaultAttributesFromType(fld.dataType.elementType) # no data look up - array_attributes = """structType='array', numFeatures=(2,6)""" - name_and_type = f"""'{col_name}', '{col_type}'""" - stmts.append(indent + f""".withColumn({name_and_type}, {field_attributes}, {array_attributes})""") + for field in schema.fields: + column_name = field.name + column_type = field.dataType.simpleString() + + if isinstance(field.dataType, types.ArrayType): + column_type = field.dataType.elementType.simpleString() + field_attributes = cls._generatorDefaultAttributesFromType(field.dataType.elementType) + array_attributes = "structType='array', numFeatures=(2,6)" + name_and_type = f"'{column_name}', '{column_type}'" + statements.append(indent + f".withColumn({name_and_type}, {field_attributes}, {array_attributes})") else: - field_attributes = cls._generatorDefaultAttributesFromType(fld.dataType, - colName=col_name, - dataSummary=dataSummary, - sourceDf=sourceDf) - stmts.append(indent + f""".withColumn('{col_name}', '{col_type}', {field_attributes})""") - stmts.append(indent + ")") + field_attributes = cls._generatorDefaultAttributesFromType( + field.dataType, colName=column_name, dataSummary=dataSummary + ) + statements.append(indent + f".withColumn('{column_name}', '{column_type}', {field_attributes})") + statements.append(indent + ")") if not suppressOutput: - for line in stmts: + for line in statements: print(line) - return "\n".join(stmts) + return "\n".join(statements) @classmethod - def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None): + def scriptDataGeneratorFromSchema( + cls, schema: types.StructType, suppressOutput: bool = False, name: str | None = None + ) -> str: """ - Generate outline data generator code from an existing dataframe - - This will generate a data generator spec from an existing dataframe. The resulting code - can be used to generate a data generation specification. - - Note at this point in time, the code generated is stub code only. - For most uses, it will require further modification - however it provides a starting point - for generation of the specification for a given data set. - - The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object. + Generates code to build a ``DataGenerator`` from an existing dataframe schema. Analyzes the schema of the + ``DataFrame`` passed to the ``DataAnalyzer`` and returns a script for generating similar data. 
- :param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema` - :param suppressOutput: Suppress printing of generated code if True + :param schema: Pyspark schema as a ``StructType`` + :param suppressOutput: Whether to suppress printing attributes during execution (default `False`) :param name: Optional name for data generator - :return: String containing skeleton code + :return: Data generation code string + .. note:: + Code generated by this method should be treated as experimental. For most uses, generated code requires further + modification. Results are intended to provide an initial script for generating data from the input dataset. """ - return cls._scriptDataGeneratorCode(schema, - suppressOutput=suppressOutput, - name=name) + return cls._scriptDataGeneratorCode(schema, suppressOutput=suppressOutput, name=name) - def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): + def scriptDataGeneratorFromData(self, suppressOutput: bool = False, name: str | None = None) -> str: """ - Generate outline data generator code from an existing dataframe + Generates code to build a ``DataGenerator`` from an existing dataframe. Analyzes statistical properties of the + ``DataFrame`` passed to the ``DataAnalyzer`` and returns a script for generating similar data. - This will generate a data generator spec from an existing dataframe. The resulting code - can be used to generate a data generation specification. - - Note at this point in time, the code generated is stub code only. - For most uses, it will require further modification - however it provides a starting point - for generation of the specification for a given data set - - The dataframe to be analyzed is the Spark dataframe passed to the constructor of the DataAnalyzer object - - :param suppressOutput: Suppress printing of generated code if True + :param suppressOutput: Whether to suppress printing attributes during execution (default `False`) :param name: Optional name for data generator - :return: String containing skeleton code + :return: Data generation code string + .. note:: + Code generated by this method should be treated as experimental. For most uses, generated code requires further + modification. Results are intended to provide an initial script for generating data from the input dataset. """ - assert self._df is not None - - if not isinstance(self._df, ssql.DataFrame): - self.logger.warning(strip_margins( - """The parameter `sourceDf` should be a valid Pyspark dataframe. 

-                |Note this warning may false due to use of remote connection to a Spark cluster""",
-                '|'))
+        if self._df is None:
+            raise ValueError("Missing `DataAnalyzer` property `df` for scripting a data generator from data")
 
         if self._dataSummary is None:
             df_summary = self.summarizeToDF()
-
             self._dataSummary = {}
+
             for row in df_summary.collect():
                 row_key_pairs = row.asDict()
-                self._dataSummary[row['measure_']] = row_key_pairs
+                self._dataSummary[row["measure_"]] = row_key_pairs
 
-        return self._scriptDataGeneratorCode(self._df.schema,
-                                             suppressOutput=suppressOutput,
-                                             name=name,
-                                             dataSummary=self._dataSummary,
-                                             sourceDf=self._df)
+        return self._scriptDataGeneratorCode(
+            self._df.schema, suppressOutput=suppressOutput, name=name, dataSummary=self._dataSummary, sourceDf=self._df
+        )
diff --git a/dbldatagen/text_generator_plugins.py b/dbldatagen/text_generator_plugins.py
index 135d50eb..76a75f5b 100644
--- a/dbldatagen/text_generator_plugins.py
+++ b/dbldatagen/text_generator_plugins.py
@@ -8,19 +8,48 @@
 
 import importlib
 import logging
+from collections.abc import Callable
+from types import ModuleType
+from typing import Optional, Union
 
-from .text_generators import TextGenerator
-from .utils import DataGenError
+import pandas as pd
+
+from dbldatagen.text_generators import TextGenerator
+from dbldatagen.utils import DataGenError
+
+
+class _FnCallContext:
+    """
+    Inner class for storing context between function calls.
+
+    Stores initial instances of random number generators, clients for services, etc. created during execution
+    of the `initFn` calls.
+
+    :param txtGen: Reference to the outer ``PyfuncText`` object
+    """
+    textGenerator: "TextGenerator"
+
+    def __init__(self, txtGen: "TextGenerator") -> None:
+        self.textGenerator = txtGen
+
+    def __setattr__(self, name: str, value: object) -> None:
+        """Allow dynamic attribute setting for plugin context."""
+        super().__setattr__(name, value)
+
+    def __getattr__(self, name: str) -> object:
+        """Allow dynamic attribute access for plugin context."""
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
 
 
 class PyfuncText(TextGenerator):  # lgtm [py/missing-equals]
-    """ Text generator that supports generating text from arbitrary Python function
+    """
+    Text generator that supports generating text from an arbitrary Python function.
 
-    :param fn: function to call to generate text.
-    :param init: function to call to initialize context
-    :param initPerBatch: if init per batch is set to True, initialization of context is performed on every Pandas udf
-    call. Default is False.
-    :param name: String representing name of text generator when converted to string via ``repr`` or ``str``
+    :param fn: Python function which generates text
+    :param init: Python function which creates an initial context/state
+    :param initPerBatch: Whether to call the initialization function for each invocation of the Pandas UDF which
+        generates text (default `False`)
+    :param name: Optional name of the text generator when converted to string via ``repr`` or ``str``
 
     The two functions define the plugin model
 
@@ -39,70 +68,75 @@ class PyfuncText(TextGenerator):  # lgtm [py/missing-equals]
     enclosing text generator.
 
     .. note::
-        There are no expectations of repeatability of data generation when using external code
-        or external libraries to generate text.
+        There are no expectations of repeatability of data generation when using external code
+        or external libraries to generate text. 
- However, custom code can call the base class method to get a Numpy random - number generator instance. This will have been seeded using the ``dbldatagen`` - random number seed if one was specified, so random numbers generated from this will be repeatable. + However, custom code can call the base class method to get a Numpy random + number generator instance. This will have been seeded using the ``dbldatagen`` + random number seed if one was specified, so random numbers generated from this will be repeatable. - The custom code may call the property ``randomSeed`` on the text generator object to get the random seed - which may be used to seed library specific initialization. + The custom code may call the property ``randomSeed`` on the text generator object to get the random seed + which may be used to seed library specific initialization. - This random seed property may have the values ``None`` or ``-1`` which should be treated as meaning dont - use a random seed. + This random seed property may have the values ``None`` or ``-1`` which should be treated as meaning dont + use a random seed. - The code does not guarantee thread or cross process safety. If a new instance of the random number - generator is needed, you may call the base class method with the argument `forceNewInstance` set to True. + The code does not guarantee thread or cross process safety. If a new instance of the random number + generator is needed, you may call the base class method with the argument `forceNewInstance` set to True. """ - - class _FnCallContext: - """ inner class to support storage of context between calls - - initial instances of random number generators, clients for services etc here during execution - of the `initFn` calls - - :param txtGen: - reference to outer PyfnText object - - """ - - def __init__(self, txtGen): - self.textGenerator = txtGen - - def __init__(self, fn, *, init=None, initPerBatch=False, name=None, rootProperty=None): + _name: str + _initPerBatch: bool + _rootProperty: object + _pyFn: Callable + _initFn: Union[Callable, None] + _context: Union[_FnCallContext, None] + + def __init__( + self, + fn: Callable, + *, + init: Union[Callable, None] = None, + initPerBatch: bool = False, + name: Union[str, None] = None, + rootProperty: object = None + ) -> None: super().__init__() - assert fn is not None or callable(fn), "Function must be provided wiith signature fn(context, oldValue)" - assert init is None or callable(init), "Init function must be a callable function or lambda if passed" + if not callable(fn): + raise ValueError("Function must be provided with signature fn(context, oldValue)") + + if init and not callable(init): + raise ValueError("Init function must be a callable function or lambda if passed") # if root property is provided, root property will be passed to generate text function self._rootProperty = rootProperty - self._pyFn = fn # generate text function self._initFn = init # context initialization function self._context = None # context used to hold library root object and other properties # if init per batch is True, initialization of context will be per UDF call - assert initPerBatch in [True, False], "initPerBatch must evaluate to boolean True or False" - self._initPerBatch = initPerBatch + if not isinstance(initPerBatch, bool): + raise ValueError("initPerBatch must evaluate to boolean True or False") + self._initPerBatch = initPerBatch self._name = name if name is not None else "PyfuncText" - def __str__(self): - """ Get string representation of object - ``name`` 
property is used to provide user friendly name for text generator + def __str__(self) -> str: """ - return f"{self._name}({repr(self._pyFn)}, init={self._initFn})" + Gets a string representation of the text generator using the ``name`` property. - def _getContext(self, forceNewInstance=False): - """ Get the context for plugin function calls + :returns: String representation of the text generator + """ + return f"{self._name}({self._pyFn!r}, init={self._initFn})" - :param forceNewInstance: if True, forces each call to create a new context - :return: existing or newly created context. + def _getContext(self, forceNewInstance: bool = False) -> _FnCallContext: + """ + Gets the context for plugin function calls. + :param forceNewInstance: Whether to create a new context for each call (default `False`) + :return: Existing or new context for plugin function calls """ - context = self._context - if context is None or forceNewInstance: - context = PyfuncText._FnCallContext(self) + if self._context is None or forceNewInstance: + context = _FnCallContext(self) # init context using context creator if any provided if self._initFn is not None: @@ -113,41 +147,42 @@ def _getContext(self, forceNewInstance=False): self._context = context else: return context - return self._context - def pandasGenerateText(self, v): - """ Called to generate text via Pandas UDF mechanism + return self._context - :param v: base value of column as Pandas Series + def pandasGenerateText(self, v: pd.Series) -> pd.Series: + """ + Generates text from input columns using a Pandas UDF. + :param v: Input column values as Pandas Series + :returns: Generated text values as a Pandas Series or DataFrame """ # save object properties in local vars to avoid overhead of object dereferences # on every call context = self._getContext(self._initPerBatch) evalFn = self._pyFn - rootProperty = getattr(context, self._rootProperty) if self._rootProperty is not None else None + rootProperty = getattr(context, str(self._rootProperty), None) if self._rootProperty else None # define functions to call with context and with root property - def _valueFromFn(originalValue): + def _valueFromFn(originalValue: object) -> object: return evalFn(context, originalValue) - def _valueFromFnWithRoot(originalValue): + def _valueFromFnWithRoot(_: object) -> object: return evalFn(rootProperty) if rootProperty is not None: - results = v.apply(_valueFromFnWithRoot, args=None) - else: - results = v.apply(_valueFromFn, args=None) + return v.apply(_valueFromFnWithRoot) - return results + return v.apply(_valueFromFn) class PyfuncTextFactory: - """PyfuncTextFactory applies syntactic wrapping around creation of PyfuncText objects + """ + Applies syntactic wrapping around the creation of PyfuncText objects. - :param name: name of generated object (when converted to string via ``str``) + :param name: Generated object name (when converted to string via ``str``) - It allows the use of the following constructs: + This class allows the use of the following constructs: .. 
code-block:: python @@ -180,87 +215,100 @@ def initFaker(ctx): init=initFaker, rootProperty="faker", name="FakerText")) - """ + _name: str + _initPerBatch: bool + _initFn: Union[Callable, None] + _rootProperty: Union[object, None] - def __init__(self, name=None): - """ - - :param name: name of generated object (when converted to string via ``str``) - - """ + def __init__(self, name: Union[str, None] = None) -> None: self._initFn = None self._rootProperty = None self._name = "PyfuncText" if name is None else name self._initPerBatch = False - def withInit(self, fn): - """ Specifies context initialization function + def withInit(self, fn: Callable) -> "PyfuncTextFactory": + """ + Sets the initialization function for creating context. - :param fn: function pointer or lambda function for initialization - signature should ``initFunction(context)`` + :param fn: Callable function for initializing context; Signature should ``initFunction(context)`` + :returns: Modified text generation factory with the specified initialization function - .. note:: - This variation initializes the context once per worker process per text generator - instance. + .. note:: + This variation initializes the context once per worker process per text generator + instance. """ self._initFn = fn return self - def withInitPerBatch(self, fn): - """ Specifies context initialization function + def withInitPerBatch(self, fn: Callable) -> "PyfuncTextFactory": + """ + Sets the initialization function for creating context for each batch. - :param fn: function pointer or lambda function for initialization - signature should ``initFunction(context)`` + :param fn: Callable function for initializing context; Signature should ``initFunction(context)`` + :returns: Modified text generation factory with the specified initialization function called for each batch - .. note:: - This variation initializes the context once per internal pandas UDF call. - The UDF call will be called once per 10,000 rows if system is configured using defaults. - Setting the pandas batch size as an argument to the DataSpec creation will change the default - batch size. + .. note:: + This variation initializes the context once per internal pandas UDF call. + The UDF call will be called once per 10,000 rows if system is configured using defaults. + Setting the pandas batch size as an argument to the DataSpec creation will change the default + batch size. """ self._initPerBatch = True return self.withInit(fn) - def withRootProperty(self, prop): - """ If called, specifies the property of the context to be passed to the text generation function. - If not called, the context object itself will be passed to the text generation function. + def withRootProperty(self, prop: object) -> "PyfuncTextFactory": + """ + Sets the context property to be passed to the text generation function. If not called, the context object will + be passed to the text generation function. + + :param prop: Context property + :returns: Modified text generation factory with the context property """ self._rootProperty = prop return self - def __call__(self, evalFn, *args, isProperty=False, **kwargs): - """ Internal function call mechanism that implements the syntax expansion + def __call__( + self, + evalFn: Union[str, Callable], + *args, + isProperty: bool = False, + **kwargs + ) -> PyfuncText: + """ + Internal function calling mechanism that implements the syntax expansion. 
- :param evalFn: text generation function or lambda - :param args: optional args to be passed by position - :param kwargs: optional keyword args following Python keyword passing mechanism - :param isProperty: if true, interpret evalFn as string name of property, not a function or method + :param evalFn: Callable text generation function + :param args: Optional arguments to pass by position to the text generation function + :param kwargs: Optional keyword arguments following Python keyword passing mechanism + :param isProperty: Whether to interpret the evaluation function as string name of property instead of a callable + function (default `False`) """ assert evalFn is not None and (type(evalFn) is str or callable(evalFn)), "Function must be provided" - if type(evalFn) is str: - assert self._rootProperty is not None and len(self._rootProperty.strip()) > 0, \ - "string named functions can only be used on text generators with root property" - fnName = evalFn - if len(args) > 0 and len(kwargs) > 0: - # generate lambda with both kwargs and args - assert not isProperty, "isProperty cannot be true if using arguments" - evalFn = lambda root: getattr(root, fnName)(*args, **kwargs) - elif len(args) > 0: - # generate lambda with positional args - assert not isProperty, "isProperty cannot be true if using arguments" - evalFn = lambda root: getattr(root, fnName)(*args) - elif len(kwargs) > 0: - # generate lambda with keyword args - assert not isProperty, "isProperty cannot be true if using arguments" - evalFn = lambda root: getattr(root, fnName)(**kwargs) - elif isProperty: - # generate lambda with property access, not method call - evalFn = lambda root: getattr(root, fnName) - else: - # generate lambda with no args - evalFn = (lambda root: getattr(root, fnName)()) + if isinstance(evalFn, str): + if not self._rootProperty: + raise ValueError("String named functions can only be used on text generators with root property") + function_name = evalFn + + if (len(args) > 0 or len(kwargs) > 0) and isProperty: + raise ValueError("Argument 'isProperty' cannot be used when passing arguments") + + def generated_evalFn(root: object) -> object: + method = getattr(root, function_name) + + if isProperty: + return method + elif len(args) > 0 and len(kwargs) > 0: + return method(*args, **kwargs) + elif len(args) > 0: + return method(*args) + elif len(kwargs) > 0: + return method(**kwargs) + else: + return method() + + evalFn = generated_evalFn # returns the actual PyfuncText text generator object. # Note all syntax expansion is performed once only @@ -268,24 +316,31 @@ def __call__(self, evalFn, *args, isProperty=False, **kwargs): class FakerTextFactory(PyfuncTextFactory): - """ Factory object for Faker text generator flavored ``PyfuncText`` objects + """ + Factory for creating Faker text generators. - :param locale: list of locales. If empty, defaults to ``en-US`` - :param providers: list of providers - :param name: name of generated objects. Defaults to ``FakerText`` - :param lib: library import name of Faker library. 
If none passed, uses ``faker``
-    :param rootClass: name of root object class If none passed, uses ``Faker``
+    :param locale: Optional list of locales (default is ``["en-US"]``)
+    :param providers: List of providers
+    :param name: Optional name of generated objects (default is ``FakerText``)
+    :param lib: Optional import alias of Faker library (default is ``"faker"``)
+    :param rootClass: Optional name of the root object class (default is ``"Faker"``)
 
     ..note :: Both the library name and root object class can be overridden - this is primarily for
     internal testing purposes.
 
     """
-    _FAKER_LIB = "faker"
-
-    _defaultFakerTextFactory = None
-
-    def __init__(self, *, locale=None, providers=None, name="FakerText", lib=None,
-                 rootClass=None):
+    _defaultFakerTextFactory: Optional["FakerTextFactory"] = None
+    _FAKER_LIB: str = "faker"
 
+    def __init__(
+        self,
+        *,
+        locale: Union[str, list[str], None] = None,
+        providers: Union[list, None] = None,
+        name: str = "FakerText",
+        lib: Union[str, None] = None,
+        rootClass: Union[str, None] = None
+    ) -> None:
 
         super().__init__(name)
 
@@ -304,37 +359,42 @@ def __init__(self, *, locale=None, providers=None, name="FakerText", lib=None,
         self._rootObjectClass = rootClass
 
         # load the library
-        fakerModule = self._loadLibrary(lib)
+        faker_module = self._loadLibrary(lib)
 
         # make the initialization function
-        initFn = self._mkInitFn(fakerModule, locale, providers)
+        init_function = self._mkInitFn(faker_module, locale, providers)
 
-        self.withInit(initFn)
+        self.withInit(init_function)
         self.withRootProperty("faker")
 
     @classmethod
-    def _getDefaultFactory(cls, lib=None, rootClass=None):
-        """Class method to get default faker text factory
+    def _getDefaultFactory(cls, lib: Union[str, None] = None, rootClass: Union[str, None] = None) -> "FakerTextFactory":
+        """
+        Gets a default faker text factory.
 
-        Not intended for general use
+        :param lib: Optional import alias of Faker library (default is ``"faker"``)
+        :param rootClass: Optional name of the root object class (default is ``"Faker"``)
         """
         if cls._defaultFakerTextFactory is None:
             cls._defaultFakerTextFactory = FakerTextFactory(lib=lib, rootClass=rootClass)
         return cls._defaultFakerTextFactory
 
-    def _mkInitFn(self, libModule, locale, providers):
-        """ Make Faker initialization function
+    def _mkInitFn(self, libModule: object, locale: Union[str, list[str], None], providers: Union[list, None]) -> Callable:
+        """
+        Creates a Faker initialization function.
 
-        :param locale: locale string or list of locale strings
-        :param providers: providers to load
-        :return:
+        :param libModule: Faker module
+        :param locale: Locale string or list of locale strings (e.g. "en-us")
+        :param providers: List of Faker providers to load
+        :returns: Callable initialization function
         """
-        assert libModule is not None, "must have a valid loaded Faker library module"
+        if libModule is None:
+            raise ValueError("must have a valid loaded Faker library module")
 
         fakerClass = getattr(libModule, self._rootObjectClass)
 
         # define the initialization function for Faker
-        def fakerInitFn(ctx):
+        def fakerInitFn(ctx: _FnCallContext) -> None:
             if locale is not None:
                 ctx.faker = fakerClass(locale=locale)
             else:
@@ -342,44 +402,52 @@ def fakerInitFn(ctx):
 
             if providers is not None:
                 for provider in providers:
-                    ctx.faker.add_provider(provider)
+                    ctx.faker.add_provider(provider)  # type: ignore[attr-defined]
 
         return fakerInitFn
 
-    def _loadLibrary(self, lib):
-        """ Load faker library if not already loaded
+    def _loadLibrary(self, lib: str) -> ModuleType:
+        """
+        Loads the faker library. 
- :param lib: library name of Faker library. If none passed, uses ``faker`` + :param lib: Optional alias name for Faker library (default is ``"faker"``) """ - # load library try: if lib is not None: - assert type(lib) is str and len(lib.strip()), f"Library ``{lib}`` must be a valid library name" + if not isinstance(lib, str): + raise ValueError(f"Input Faker alias with type '{type(lib)}' must be of type 'str'") + + if not lib: + raise ValueError("Input Faker alias must be provided") if lib in globals(): - return globals()[lib] + module = globals()[lib] + if isinstance(module, ModuleType): + return module + else: + raise ValueError(f"Global '{lib}' is not a module") + else: fakerModule = importlib.import_module(lib) globals()[lib] = fakerModule return fakerModule - except RuntimeError as err: - # pylint: disable=raise-missing-from - raise DataGenError("Could not load or initialize Faker library", err) - + else: + raise ValueError("Library name must be provided") -def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs): - """Generate faker text generator object using default FakerTextFactory - instance + except RuntimeError as err: + raise DataGenError("Could not load or initialize Faker library") from err - :param mname: method name to invoke - :param args: positional args to be passed to underlying Faker instance - :param _lib: internal only param - library to load - :param _rootClass: internal only param - root class to create - - :returns : instance of PyfuncText for use with Faker - ``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")`` +def fakerText(mname: str, *args, _lib: Union[str, None] = None, _rootClass: Union[str, None] = None, **kwargs) -> PyfuncText: + """ + Creates a faker text generator object using the default ``FakerTextFactory`` instance. Calling this method is + equivalent to calling ``FakerTextFactory()("sentence")``. 

+
+    :param mname: Method name to invoke
+    :param args: Positional arguments to pass to the Faker text generation method
+    :param _lib: Optional import alias of Faker library (dfault is ``"faker"``)
+    :param _rootClass: Optional name of the root object class (default is ``"Faker"``)
+    :returns: ``PyfuncText`` for use with Faker
     """
-    defaultFactory = FakerTextFactory._getDefaultFactory(lib=_lib,
-                                                         rootClass=_rootClass)
-    return defaultFactory(mname, *args, **kwargs)  # pylint: disable=not-callable
+    default_factory = FakerTextFactory._getDefaultFactory(lib=_lib, rootClass=_rootClass)
+    return default_factory(mname, *args, **kwargs)  # pylint: disable=not-callable
diff --git a/pyproject.toml b/pyproject.toml
index a8eb3e29..aab70aa9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -142,13 +142,11 @@ exclude = [
     "dbldatagen/__init__.py",
     "dbldatagen/column_generation_spec.py",
     "dbldatagen/column_spec_options.py",
-    "dbldatagen/data_analyzer.py",
     "dbldatagen/datagen_constants.py",
     "dbldatagen/datarange.py",
     "dbldatagen/daterange.py",
     "dbldatagen/nrange.py",
     "dbldatagen/schema_parser.py",
-    "dbldatagen/text_generator_plugins.py",
 ]
 
 [tool.ruff.lint]
@@ -223,14 +221,12 @@ ignore = [
     "dbldatagen/__init__.py",
     "dbldatagen/column_generation_spec.py",
     "dbldatagen/column_spec_options.py",
-    "dbldatagen/data_analyzer.py",
     "dbldatagen/datagen_constants.py",
     "dbldatagen/datarange.py",
     "dbldatagen/daterange.py",
     "dbldatagen/nrange.py",
     "dbldatagen/schema_parser.py",
     "dbldatagen/serialization.py",
-    "dbldatagen/text_generator_plugins.py",
     "dbldatagen/utils.py"
 ]
 
@@ -258,7 +254,6 @@ ignore-paths = [
     "dbldatagen/__init__.py",
     "dbldatagen/column_generation_spec.py",
     "dbldatagen/column_spec_options.py",
-    "dbldatagen/data_analyzer.py",
     "dbldatagen/data_generator.py",
     "dbldatagen/datagen_constants.py",
     "dbldatagen/datarange.py",
@@ -266,7 +261,6 @@ ignore-paths = [
     "dbldatagen/nrange.py",
     "dbldatagen/schema_parser.py",
     "dbldatagen/serialization.py",
-    "dbldatagen/text_generator_plugins.py",
     "dbldatagen/utils.py"
 ]
 
@@ -389,14 +383,12 @@ exclude = [
     "dbldatagen/__init__.py",
     "dbldatagen/column_generation_spec.py",
     "dbldatagen/column_spec_options.py",
-    "dbldatagen/data_analyzer.py",
    "dbldatagen/datagen_constants.py",
     "dbldatagen/datarange.py",
     "dbldatagen/daterange.py",
     "dbldatagen/nrange.py",
     "dbldatagen/schema_parser.py",
     "dbldatagen/serialization.py",
-    "dbldatagen/text_generator_plugins.py",
     "dbldatagen/utils.py"
 ]
 warn_return_any = true

From ff5d48330447b762dbef9c81bd0dfe7689241fc0 Mon Sep 17 00:00:00 2001
From: Greg Hansen
Date: Wed, 15 Oct 2025 18:41:11 -0400
Subject: [PATCH 2/4] Update formatting and fix docstrings

---
 dbldatagen/data_analyzer.py          |  2 +-
 dbldatagen/text_generator_plugins.py | 32 ++++++++++++++--------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py
index 58d1e501..60aea67b 100644
--- a/dbldatagen/data_analyzer.py
+++ b/dbldatagen/data_analyzer.py
@@ -24,7 +24,7 @@ class DataAnalyzer:
 
     """
     This class is used to analyze an existing dataset to assist in generating a test data set with similar data
-    characteristics. Analyzer results and to generate code from existing schemas and data.
+    characteristics. Analyzer results can be used to generate code from existing schemas and data. 
:param df: Spark ``DataFrame`` to analyze :param sparkSession: ``SparkSession`` to use diff --git a/dbldatagen/text_generator_plugins.py b/dbldatagen/text_generator_plugins.py index 76a75f5b..3c4b7d62 100644 --- a/dbldatagen/text_generator_plugins.py +++ b/dbldatagen/text_generator_plugins.py @@ -10,7 +10,7 @@ import logging from collections.abc import Callable from types import ModuleType -from typing import Optional, Union +from typing import Optional import pandas as pd @@ -88,16 +88,16 @@ class PyfuncText(TextGenerator): # lgtm [py/missing-equals] _initPerBatch: bool _rootProperty: object _pyFn: Callable - _initFn: Union[Callable, None] - _context: Union[_FnCallContext, None] + _initFn: Callable | None + _context: _FnCallContext | None def __init__( self, fn: Callable, *, - init: Union[Callable, None] = None, + init: Callable | None = None, initPerBatch: bool = False, - name: Union[str, None] = None, + name: str | None = None, rootProperty: object = None ) -> None: super().__init__() @@ -218,10 +218,10 @@ def initFaker(ctx): """ _name: str _initPerBatch: bool - _initFn: Union[Callable, None] - _rootProperty: Union[object, None] + _initFn: Callable | None + _rootProperty: object | None - def __init__(self, name: Union[str, None] = None) -> None: + def __init__(self, name: str | None = None) -> None: self._initFn = None self._rootProperty = None self._name = "PyfuncText" if name is None else name @@ -270,7 +270,7 @@ def withRootProperty(self, prop: object) -> "PyfuncTextFactory": def __call__( self, - evalFn: Union[str, Callable], + evalFn: str | Callable, *args, isProperty: bool = False, **kwargs @@ -335,11 +335,11 @@ class FakerTextFactory(PyfuncTextFactory): def __init__( self, *, - locale: Union[str, list[str], None] = None, - providers: Union[list, None] = None, + locale: str | list[str] | None = None, + providers: list | None = None, name: str = "FakerText", - lib: Union[str, None] = None, - rootClass: Union[str, None] = None + lib: str | None = None, + rootClass: str | None = None ) -> None: super().__init__(name) @@ -368,7 +368,7 @@ def __init__( self.withRootProperty("faker") @classmethod - def _getDefaultFactory(cls, lib: Union[str, None] = None, rootClass: Union[str, None] = None) -> "FakerTextFactory": + def _getDefaultFactory(cls, lib: str | None = None, rootClass: str | None = None) -> "FakerTextFactory": """ Gets a default faker text factory. @@ -379,7 +379,7 @@ def _getDefaultFactory(cls, lib: Union[str, None] = None, rootClass: Union[str, cls._defaultFakerTextFactory = FakerTextFactory(lib=lib, rootClass=rootClass) return cls._defaultFakerTextFactory - def _mkInitFn(self, libModule: object, locale: Union[str, list[str], None], providers: Union[list, None]) -> Callable: + def _mkInitFn(self, libModule: object, locale: str | list[str] | None, providers: list | None) -> Callable: """ Creates a Faker initialization function. @@ -438,7 +438,7 @@ def _loadLibrary(self, lib: str) -> ModuleType: raise DataGenError("Could not load or initialize Faker library") from err -def fakerText(mname: str, *args, _lib: Union[str, None] = None, _rootClass: Union[str, None] = None, **kwargs) -> PyfuncText: +def fakerText(mname: str, *args, _lib: str | None = None, _rootClass: str | None = None, **kwargs) -> PyfuncText: """ Creates a faker text generator object using the default ``FakerTextFactory`` instance. Calling this method is equivalent to calling ``FakerTextFactory()("sentence")``. 
From 2e5afe7f5e615dfa439985f0c7fa803b9857ae99 Mon Sep 17 00:00:00 2001
From: Greg Hansen
Date: Fri, 17 Oct 2025 08:56:12 -0400
Subject: [PATCH 3/4] Update inferred max value for DecimalType columns

---
 dbldatagen/data_analyzer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py
index 60aea67b..65c16b2a 100644
--- a/dbldatagen/data_analyzer.py
+++ b/dbldatagen/data_analyzer.py
@@ -227,7 +227,8 @@ def summarizeToDF(self) -> DataFrame:
             measureName="min",
             fieldExprs=[f"string(min({dtype[0]})) as {dtype[0]}" for dtype in dtypes],
             dfData=self._df,
-            dfSummary=data_summary_df)
+            dfSummary=data_summary_df
+        )
 
         data_summary_df = self._addMeasureToSummary(
             measureName="max",
@@ -384,8 +385,9 @@ def _generatorDefaultAttributesFromType(
             result = """expr='current_date()'"""
 
         elif isinstance(sqlType, types.DecimalType):
+            max_decimal_value = 10**(sqlType.precision - sqlType.scale) - 10**(-1 * sqlType.scale)
             min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0)
-            max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000)
+            max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=max_decimal_value)
             result = f"""minValue={min_value}, maxValue={max_value}"""
 
         elif sqlType in [types.FloatType(), types.DoubleType()]:

From 089937644f576d52807a514719882d96cc5a7fc5 Mon Sep 17 00:00:00 2001
From: Greg Hansen
Date: Sun, 19 Oct 2025 22:01:49 -0400
Subject: [PATCH 4/4] Update docstrings and inferred max value for DecimalType columns

---
 dbldatagen/data_analyzer.py          | 5 ++---
 dbldatagen/text_generator_plugins.py | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py
index 65c16b2a..5dcaaf92 100644
--- a/dbldatagen/data_analyzer.py
+++ b/dbldatagen/data_analyzer.py
@@ -125,7 +125,7 @@ def _addMeasureToSummary(
         :param dfData: Source ``DataFrame`` to summarize
         :param rowLimit: Number of rows to use for ``DataFrame`` summarization
         :param dfSummary: Summary metrics ``DataFrame``
-        :return: Summary metrics ``DataFrame`` with the added measure
+        :returns: Summary metrics ``DataFrame`` with the added measure
         """
         if dfData is None:
             raise ValueError("Input DataFrame `dfData` must be supplied when adding measures to a summary")
@@ -384,10 +384,9 @@ def _generatorDefaultAttributesFromType(
         elif sqlType == types.DateType():
             result = """expr='current_date()'"""
 
         elif isinstance(sqlType, types.DecimalType):
-            max_decimal_value = 10**(sqlType.precision - sqlType.scale) - 10**(-1 * sqlType.scale)
             min_value = cls._valueFromSummary(dataSummary, colName, "min", defaultValue=0)
-            max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=max_decimal_value)
+            max_value = cls._valueFromSummary(dataSummary, colName, "max", defaultValue=1000000.0)
             result = f"""minValue={min_value}, maxValue={max_value}"""
 
         elif sqlType in [types.FloatType(), types.DoubleType()]:

diff --git a/dbldatagen/text_generator_plugins.py b/dbldatagen/text_generator_plugins.py
index 3c4b7d62..11cf8c61 100644
--- a/dbldatagen/text_generator_plugins.py
+++ b/dbldatagen/text_generator_plugins.py
@@ -445,7 +445,7 @@ def fakerText(mname: str, *args, _lib: str | None = None, _rootClass: str | None = None, **kwargs) -> PyfuncText:
     """
     Creates a faker text generator object using the default ``FakerTextFactory`` instance. Calling this method is
     equivalent to calling ``FakerTextFactory()("sentence")``. 
 
     :param mname: Method name to invoke
     :param args: Positional arguments to pass to the Faker text generation method
-    :param _lib: Optional import alias of Faker library (dfault is ``"faker"``)
+    :param _lib: 
Optional import alias of Faker library (default is ``"faker"``)
     :param _rootClass: Optional name of the root object class (default is ``"Faker"``)
     :returns: ``PyfuncText`` for use with Faker
     """
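A minimal usage sketch of the APIs touched by this series, for exercising the changes end to end. It assumes a live ``SparkSession`` bound to ``spark``, a ``dbldatagen`` installation, and (for the final line) the optional ``faker`` package; the sample data and variable names are illustrative only and are not part of the changes above.

```python
from dbldatagen.data_analyzer import DataAnalyzer
from dbldatagen.text_generator_plugins import fakerText

# Small frame to analyze; three rows with one null exercise the summary measures.
df = spark.createDataFrame([(1, "a"), (2, "b"), (3, None)], "id long, label string")

analyzer = DataAnalyzer(df=df, sparkSession=spark)

# summarize() returns measures such as count, null_probability and min/max as a string.
print(analyzer.summarize(suppressOutput=True))

# scriptDataGeneratorFromData() emits stub DataGenerator code seeded with those measures.
generated_code = analyzer.scriptDataGeneratorFromData(suppressOutput=True)

# fakerText("sentence") builds a PyfuncText through the default FakerTextFactory.
sentence_generator = fakerText("sentence")
```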