Skip to main content
Version: 0.17.23

SparkS3Datasource

class great_expectations.datasource.fluent.SparkS3Datasource(*, type: Literal['spark_s3'] = 'spark_s3', name: str, id: Optional[uuid.UUID] = None, assets: List[Union[great_expectations.datasource.fluent.spark_file_path_datasource.CSVAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryCSVAsset, great_expectations.datasource.fluent.spark_file_path_datasource.ParquetAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryParquetAsset, great_expectations.datasource.fluent.spark_file_path_datasource.ORCAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryORCAsset, great_expectations.datasource.fluent.spark_file_path_datasource.JSONAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryJSONAsset, great_expectations.datasource.fluent.spark_file_path_datasource.TextAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryTextAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DeltaAsset, great_expectations.datasource.fluent.spark_file_path_datasource.DirectoryDeltaAsset]] = [], spark_config: Optional[Dict[pydantic.v1.types.StrictStr, Union[pydantic.v1.types.StrictStr, pydantic.v1.types.StrictInt, pydantic.v1.types.StrictFloat, pydantic.v1.types.StrictBool]]] = None, force_reuse_spark_context: bool = True, persist: bool = True, bucket: str, boto3_options: Dict[str, Union[great_expectations.datasource.fluent.config_str.ConfigStr, Any]] = {})
add_csv_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, schema: Optional[Union[great_expectations.datasource.fluent.serializable_types.pyspark.SerializableStructType, str]] = None, sep: Optional[str] = None, encoding: Optional[str] = None, quote: Optional[str] = None, escape: Optional[str] = None, comment: Optional[str] = None, header: Optional[Union[bool, str]] = None, inferSchema: Optional[Union[bool, str]] = None, ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None, ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None, nullValue: Optional[str] = None, nanValue: Optional[str] = None, positiveInf: Optional[str] = None, negativeInf: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, maxColumns: Optional[Union[int, str]] = None, maxCharsPerColumn: Optional[Union[int, str]] = None, maxMalformedLogPerPartition: Optional[Union[int, str]] = None, mode: Optional[Literal['PERMISSIVE', 'DROPMALFORMED', 'FAILFAST']] = None, columnNameOfCorruptRecord: Optional[str] = None, multiLine: Optional[Union[bool, str]] = None, charToEscapeQuoteEscaping: Optional[str] = None, samplingRatio: Optional[Union[float, str]] = None, enforceSchema: Optional[Union[bool, str]] = None, emptyValue: Optional[str] = None, locale: Optional[str] = None, lineSep: Optional[str] = None, unescapedQuoteHandling: Optional[Literal['STOP_AT_CLOSING_QUOTE', 'BACK_TO_DELIMITER', 'STOP_AT_DELIMITER', 'SKIP_VALUE', 'RAISE_ERROR']] = None) -> pydantic.BaseModel
add_delta_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, timestampAsOf: Optional[str] = None, versionAsOf: Optional[str] = None) -> pydantic.BaseModel
add_directory_csv_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, schema: Optional[Union[great_expectations.datasource.fluent.serializable_types.pyspark.SerializableStructType, str]] = None, sep: Optional[str] = None, encoding: Optional[str] = None, quote: Optional[str] = None, escape: Optional[str] = None, comment: Optional[str] = None, header: Optional[Union[bool, str]] = None, inferSchema: Optional[Union[bool, str]] = None, ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None, ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None, nullValue: Optional[str] = None, nanValue: Optional[str] = None, positiveInf: Optional[str] = None, negativeInf: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, maxColumns: Optional[Union[int, str]] = None, maxCharsPerColumn: Optional[Union[int, str]] = None, maxMalformedLogPerPartition: Optional[Union[int, str]] = None, mode: Optional[Literal['PERMISSIVE', 'DROPMALFORMED', 'FAILFAST']] = None, columnNameOfCorruptRecord: Optional[str] = None, multiLine: Optional[Union[bool, str]] = None, charToEscapeQuoteEscaping: Optional[str] = None, samplingRatio: Optional[Union[float, str]] = None, enforceSchema: Optional[Union[bool, str]] = None, emptyValue: Optional[str] = None, locale: Optional[str] = None, lineSep: Optional[str] = None, unescapedQuoteHandling: Optional[Literal['STOP_AT_CLOSING_QUOTE', 'BACK_TO_DELIMITER', 'STOP_AT_DELIMITER', 'SKIP_VALUE', 'RAISE_ERROR']] = None, data_directory: pathlib.Path) -> pydantic.BaseModel
add_directory_delta_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, timestampAsOf: Optional[str] = None, versionAsOf: Optional[str] = None, data_directory: pathlib.Path) -> pydantic.BaseModel
add_directory_json_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, schema: Optional[Union[great_expectations.datasource.fluent.serializable_types.pyspark.SerializableStructType, str]] = None, primitivesAsString: Optional[Union[bool, str]] = None, prefersDecimal: Optional[Union[bool, str]] = None, allowComments: Optional[Union[bool, str]] = None, allowUnquotedFieldNames: Optional[Union[bool, str]] = None, allowSingleQuotes: Optional[Union[bool, str]] = None, allowNumericLeadingZero: Optional[Union[bool, str]] = None, allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None, mode: Optional[Literal['PERMISSIVE', 'DROPMALFORMED', 'FAILFAST']] = None, columnNameOfCorruptRecord: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, multiLine: Optional[Union[bool, str]] = None, allowUnquotedControlChars: Optional[Union[bool, str]] = None, lineSep: Optional[str] = None, samplingRatio: Optional[Union[float, str]] = None, dropFieldIfAllNull: Optional[Union[bool, str]] = None, encoding: Optional[str] = None, locale: Optional[str] = None, allowNonNumericNumbers: Optional[Union[bool, str]] = None, data_directory: pathlib.Path) -> pydantic.BaseModel
add_directory_orc_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, mergeSchema: Optional[Union[bool, str]] = False, data_directory: pathlib.Path) -> pydantic.BaseModel
add_directory_parquet_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, mergeSchema: Optional[Union[bool, str]] = None, datetimeRebaseMode: Optional[Literal['EXCEPTION', 'CORRECTED', 'LEGACY']] = None, int96RebaseMode: Optional[Literal['EXCEPTION', 'CORRECTED', 'LEGACY']] = None, data_directory: pathlib.Path) -> pydantic.BaseModel
add_directory_text_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, wholetext: bool = False, lineSep: Optional[str] = None, data_directory: pathlib.Path) -> pydantic.BaseModel
add_json_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, schema: Optional[Union[great_expectations.datasource.fluent.serializable_types.pyspark.SerializableStructType, str]] = None, primitivesAsString: Optional[Union[bool, str]] = None, prefersDecimal: Optional[Union[bool, str]] = None, allowComments: Optional[Union[bool, str]] = None, allowUnquotedFieldNames: Optional[Union[bool, str]] = None, allowSingleQuotes: Optional[Union[bool, str]] = None, allowNumericLeadingZero: Optional[Union[bool, str]] = None, allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None, mode: Optional[Literal['PERMISSIVE', 'DROPMALFORMED', 'FAILFAST']] = None, columnNameOfCorruptRecord: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, multiLine: Optional[Union[bool, str]] = None, allowUnquotedControlChars: Optional[Union[bool, str]] = None, lineSep: Optional[str] = None, samplingRatio: Optional[Union[float, str]] = None, dropFieldIfAllNull: Optional[Union[bool, str]] = None, encoding: Optional[str] = None, locale: Optional[str] = None, allowNonNumericNumbers: Optional[Union[bool, str]] = None) -> pydantic.BaseModel
add_orc_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, mergeSchema: Optional[Union[bool, str]] = False) -> pydantic.BaseModel
add_parquet_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, mergeSchema: Optional[Union[bool, str]] = None, datetimeRebaseMode: Optional[Literal['EXCEPTION', 'CORRECTED', 'LEGACY']] = None, int96RebaseMode: Optional[Literal['EXCEPTION', 'CORRECTED', 'LEGACY']] = None) -> pydantic.BaseModel
add_text_asset(name: str, *, id: Optional[uuid.UUID] = None, order_by: List[great_expectations.datasource.fluent.interfaces.Sorter] = None, batch_metadata: Dict[str, Any] = None, batching_regex: Pattern = re.compile('.*'), connect_options: Mapping = None, splitter: Optional[Union[great_expectations.datasource.fluent.spark_generic_splitters.SplitterColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterMultiColumnValue, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDividedInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterModInteger, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYear, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonth, great_expectations.datasource.fluent.spark_generic_splitters.SplitterYearAndMonthAndDay, great_expectations.datasource.fluent.spark_generic_splitters.SplitterDatetimePart]] = None, pathGlobFilter: Optional[Union[bool, str]] = None, recursiveFileLookup: Optional[Union[bool, str]] = None, modifiedBefore: Optional[Union[bool, str]] = None, modifiedAfter: Optional[Union[bool, str]] = None, wholetext: bool = False, lineSep: Optional[str] = None) -> pydantic.BaseModel