Spark SQL
This page gives an overview of all public Spark SQL APIs.
- Core Classes
- pyspark.sql.SparkSession
 - pyspark.sql.Catalog
 - pyspark.sql.DataFrame
 - pyspark.sql.Column
 - pyspark.sql.Observation
 - pyspark.sql.Row
 - pyspark.sql.GroupedData
 - pyspark.sql.PandasCogroupedOps
 - pyspark.sql.DataFrameNaFunctions
 - pyspark.sql.DataFrameStatFunctions
 - pyspark.sql.Window
 - pyspark.sql.DataFrameReader
 - pyspark.sql.DataFrameWriter
 - pyspark.sql.DataFrameWriterV2
 - pyspark.sql.UDFRegistration
 - pyspark.sql.UDTFRegistration
 - pyspark.sql.udf.UserDefinedFunction
 - pyspark.sql.udtf.UserDefinedTableFunction
 - pyspark.sql.datasource.DataSource
 - pyspark.sql.datasource.DataSourceReader
 - pyspark.sql.datasource.DataSourceStreamReader
 - pyspark.sql.datasource.DataSourceWriter
 - pyspark.sql.datasource.DataSourceRegistration
 - pyspark.sql.datasource.InputPartition
 - pyspark.sql.datasource.WriterCommitMessage
 - pyspark.sql.tvf.TableValuedFunction
 - pyspark.sql.VariantVal
 - pyspark.sql.table_arg.TableArg
 
 - Spark Session
- pyspark.sql.SparkSession.active
 - pyspark.sql.SparkSession.builder.appName
 - pyspark.sql.SparkSession.builder.config
 - pyspark.sql.SparkSession.builder.enableHiveSupport
 - pyspark.sql.SparkSession.builder.getOrCreate
 - pyspark.sql.SparkSession.builder.master
 - pyspark.sql.SparkSession.builder.remote
 - pyspark.sql.SparkSession.addArtifact
 - pyspark.sql.SparkSession.addArtifacts
 - pyspark.sql.SparkSession.addTag
 - pyspark.sql.SparkSession.catalog
 - pyspark.sql.SparkSession.clearTags
 - pyspark.sql.SparkSession.conf
 - pyspark.sql.SparkSession.createDataFrame
 - pyspark.sql.SparkSession.dataSource
 - pyspark.sql.SparkSession.getActiveSession
 - pyspark.sql.SparkSession.getTags
 - pyspark.sql.SparkSession.interruptAll
 - pyspark.sql.SparkSession.interruptOperation
 - pyspark.sql.SparkSession.interruptTag
 - pyspark.sql.SparkSession.newSession
 - pyspark.sql.SparkSession.profile
 - pyspark.sql.SparkSession.removeTag
 - pyspark.sql.SparkSession.range
 - pyspark.sql.SparkSession.read
 - pyspark.sql.SparkSession.readStream
 - pyspark.sql.SparkSession.sparkContext
 - pyspark.sql.SparkSession.sql
 - pyspark.sql.SparkSession.stop
 - pyspark.sql.SparkSession.streams
 - pyspark.sql.SparkSession.table
 - pyspark.sql.SparkSession.tvf
 - pyspark.sql.SparkSession.udf
 - pyspark.sql.SparkSession.udtf
 - pyspark.sql.SparkSession.version
 - pyspark.sql.is_remote
 - Spark Connect Only
 
 - Configuration
 - Input/Output
- pyspark.sql.DataFrameReader.csv
 - pyspark.sql.DataFrameReader.format
 - pyspark.sql.DataFrameReader.jdbc
 - pyspark.sql.DataFrameReader.json
 - pyspark.sql.DataFrameReader.load
 - pyspark.sql.DataFrameReader.option
 - pyspark.sql.DataFrameReader.options
 - pyspark.sql.DataFrameReader.orc
 - pyspark.sql.DataFrameReader.parquet
 - pyspark.sql.DataFrameReader.schema
 - pyspark.sql.DataFrameReader.table
 - pyspark.sql.DataFrameReader.text
 - pyspark.sql.DataFrameWriter.bucketBy
 - pyspark.sql.DataFrameWriter.csv
 - pyspark.sql.DataFrameWriter.format
 - pyspark.sql.DataFrameWriter.insertInto
 - pyspark.sql.DataFrameWriter.jdbc
 - pyspark.sql.DataFrameWriter.json
 - pyspark.sql.DataFrameWriter.mode
 - pyspark.sql.DataFrameWriter.option
 - pyspark.sql.DataFrameWriter.options
 - pyspark.sql.DataFrameWriter.orc
 - pyspark.sql.DataFrameWriter.parquet
 - pyspark.sql.DataFrameWriter.partitionBy
 - pyspark.sql.DataFrameWriter.save
 - pyspark.sql.DataFrameWriter.saveAsTable
 - pyspark.sql.DataFrameWriter.sortBy
 - pyspark.sql.DataFrameWriter.text
 - pyspark.sql.DataFrameWriterV2.using
 - pyspark.sql.DataFrameWriterV2.option
 - pyspark.sql.DataFrameWriterV2.options
 - pyspark.sql.DataFrameWriterV2.tableProperty
 - pyspark.sql.DataFrameWriterV2.partitionedBy
 - pyspark.sql.DataFrameWriterV2.create
 - pyspark.sql.DataFrameWriterV2.replace
 - pyspark.sql.DataFrameWriterV2.createOrReplace
 - pyspark.sql.DataFrameWriterV2.append
 - pyspark.sql.DataFrameWriterV2.overwrite
 - pyspark.sql.DataFrameWriterV2.overwritePartitions
 - pyspark.sql.MergeIntoWriter.whenMatched
 - pyspark.sql.MergeIntoWriter.whenNotMatched
 - pyspark.sql.MergeIntoWriter.whenNotMatchedBySource
 - pyspark.sql.MergeIntoWriter.withSchemaEvolution
 - pyspark.sql.MergeIntoWriter.merge
 
 - DataFrame
- pyspark.sql.DataFrame.__getattr__
 - pyspark.sql.DataFrame.__getitem__
 - pyspark.sql.DataFrame.agg
 - pyspark.sql.DataFrame.alias
 - pyspark.sql.DataFrame.approxQuantile
 - pyspark.sql.DataFrame.asTable
 - pyspark.sql.DataFrame.cache
 - pyspark.sql.DataFrame.checkpoint
 - pyspark.sql.DataFrame.coalesce
 - pyspark.sql.DataFrame.colRegex
 - pyspark.sql.DataFrame.collect
 - pyspark.sql.DataFrame.columns
 - pyspark.sql.DataFrame.corr
 - pyspark.sql.DataFrame.count
 - pyspark.sql.DataFrame.cov
 - pyspark.sql.DataFrame.createGlobalTempView
 - pyspark.sql.DataFrame.createOrReplaceGlobalTempView
 - pyspark.sql.DataFrame.createOrReplaceTempView
 - pyspark.sql.DataFrame.createTempView
 - pyspark.sql.DataFrame.crossJoin
 - pyspark.sql.DataFrame.crosstab
 - pyspark.sql.DataFrame.cube
 - pyspark.sql.DataFrame.describe
 - pyspark.sql.DataFrame.distinct
 - pyspark.sql.DataFrame.drop
 - pyspark.sql.DataFrame.dropDuplicates
 - pyspark.sql.DataFrame.dropDuplicatesWithinWatermark
 - pyspark.sql.DataFrame.drop_duplicates
 - pyspark.sql.DataFrame.dropna
 - pyspark.sql.DataFrame.dtypes
 - pyspark.sql.DataFrame.exceptAll
 - pyspark.sql.DataFrame.executionInfo
 - pyspark.sql.DataFrame.exists
 - pyspark.sql.DataFrame.explain
 - pyspark.sql.DataFrame.fillna
 - pyspark.sql.DataFrame.filter
 - pyspark.sql.DataFrame.first
 - pyspark.sql.DataFrame.foreach
 - pyspark.sql.DataFrame.foreachPartition
 - pyspark.sql.DataFrame.freqItems
 - pyspark.sql.DataFrame.groupBy
 - pyspark.sql.DataFrame.groupingSets
 - pyspark.sql.DataFrame.head
 - pyspark.sql.DataFrame.hint
 - pyspark.sql.DataFrame.inputFiles
 - pyspark.sql.DataFrame.intersect
 - pyspark.sql.DataFrame.intersectAll
 - pyspark.sql.DataFrame.isEmpty
 - pyspark.sql.DataFrame.isLocal
 - pyspark.sql.DataFrame.isStreaming
 - pyspark.sql.DataFrame.join
 - pyspark.sql.DataFrame.limit
 - pyspark.sql.DataFrame.lateralJoin
 - pyspark.sql.DataFrame.localCheckpoint
 - pyspark.sql.DataFrame.mapInPandas
 - pyspark.sql.DataFrame.mapInArrow
 - pyspark.sql.DataFrame.metadataColumn
 - pyspark.sql.DataFrame.melt
 - pyspark.sql.DataFrame.na
 - pyspark.sql.DataFrame.observe
 - pyspark.sql.DataFrame.offset
 - pyspark.sql.DataFrame.orderBy
 - pyspark.sql.DataFrame.persist
 - pyspark.sql.DataFrame.plot
 - pyspark.sql.DataFrame.printSchema
 - pyspark.sql.DataFrame.randomSplit
 - pyspark.sql.DataFrame.rdd
 - pyspark.sql.DataFrame.registerTempTable
 - pyspark.sql.DataFrame.repartition
 - pyspark.sql.DataFrame.repartitionByRange
 - pyspark.sql.DataFrame.replace
 - pyspark.sql.DataFrame.rollup
 - pyspark.sql.DataFrame.sameSemantics
 - pyspark.sql.DataFrame.sample
 - pyspark.sql.DataFrame.sampleBy
 - pyspark.sql.DataFrame.scalar
 - pyspark.sql.DataFrame.schema
 - pyspark.sql.DataFrame.select
 - pyspark.sql.DataFrame.selectExpr
 - pyspark.sql.DataFrame.semanticHash
 - pyspark.sql.DataFrame.show
 - pyspark.sql.DataFrame.sort
 - pyspark.sql.DataFrame.sortWithinPartitions
 - pyspark.sql.DataFrame.sparkSession
 - pyspark.sql.DataFrame.stat
 - pyspark.sql.DataFrame.storageLevel
 - pyspark.sql.DataFrame.subtract
 - pyspark.sql.DataFrame.summary
 - pyspark.sql.DataFrame.tail
 - pyspark.sql.DataFrame.take
 - pyspark.sql.DataFrame.to
 - pyspark.sql.DataFrame.toArrow
 - pyspark.sql.DataFrame.toDF
 - pyspark.sql.DataFrame.toJSON
 - pyspark.sql.DataFrame.toLocalIterator
 - pyspark.sql.DataFrame.toPandas
 - pyspark.sql.DataFrame.transform
 - pyspark.sql.DataFrame.transpose
 - pyspark.sql.DataFrame.union
 - pyspark.sql.DataFrame.unionAll
 - pyspark.sql.DataFrame.unionByName
 - pyspark.sql.DataFrame.unpersist
 - pyspark.sql.DataFrame.unpivot
 - pyspark.sql.DataFrame.where
 - pyspark.sql.DataFrame.withColumn
 - pyspark.sql.DataFrame.withColumns
 - pyspark.sql.DataFrame.withColumnRenamed
 - pyspark.sql.DataFrame.withColumnsRenamed
 - pyspark.sql.DataFrame.withMetadata
 - pyspark.sql.DataFrame.withWatermark
 - pyspark.sql.DataFrame.write
 - pyspark.sql.DataFrame.writeStream
 - pyspark.sql.DataFrame.writeTo
 - pyspark.sql.DataFrame.mergeInto
 - pyspark.sql.DataFrame.pandas_api
 - pyspark.sql.DataFrameNaFunctions.drop
 - pyspark.sql.DataFrameNaFunctions.fill
 - pyspark.sql.DataFrameNaFunctions.replace
 - pyspark.sql.DataFrameStatFunctions.approxQuantile
 - pyspark.sql.DataFrameStatFunctions.corr
 - pyspark.sql.DataFrameStatFunctions.cov
 - pyspark.sql.DataFrameStatFunctions.crosstab
 - pyspark.sql.DataFrameStatFunctions.freqItems
 - pyspark.sql.DataFrameStatFunctions.sampleBy
 - Table Argument
 - Plotting
 
 - Column
- pyspark.sql.Column.__getattr__
 - pyspark.sql.Column.__getitem__
 - pyspark.sql.Column.alias
 - pyspark.sql.Column.asc
 - pyspark.sql.Column.asc_nulls_first
 - pyspark.sql.Column.asc_nulls_last
 - pyspark.sql.Column.astype
 - pyspark.sql.Column.between
 - pyspark.sql.Column.bitwiseAND
 - pyspark.sql.Column.bitwiseOR
 - pyspark.sql.Column.bitwiseXOR
 - pyspark.sql.Column.cast
 - pyspark.sql.Column.contains
 - pyspark.sql.Column.desc
 - pyspark.sql.Column.desc_nulls_first
 - pyspark.sql.Column.desc_nulls_last
 - pyspark.sql.Column.dropFields
 - pyspark.sql.Column.endswith
 - pyspark.sql.Column.eqNullSafe
 - pyspark.sql.Column.getField
 - pyspark.sql.Column.getItem
 - pyspark.sql.Column.ilike
 - pyspark.sql.Column.isNaN
 - pyspark.sql.Column.isNotNull
 - pyspark.sql.Column.isNull
 - pyspark.sql.Column.isin
 - pyspark.sql.Column.like
 - pyspark.sql.Column.name
 - pyspark.sql.Column.otherwise
 - pyspark.sql.Column.outer
 - pyspark.sql.Column.over
 - pyspark.sql.Column.rlike
 - pyspark.sql.Column.startswith
 - pyspark.sql.Column.substr
 - pyspark.sql.Column.try_cast
 - pyspark.sql.Column.when
 - pyspark.sql.Column.withField
 
 - Data Types
- ArrayType
 - BinaryType
 - BooleanType
 - ByteType
 - DataType
 - DateType
 - DecimalType
 - DoubleType
 - FloatType
 - IntegerType
 - LongType
 - MapType
 - NullType
 - ShortType
 - StringType
 - CharType
 - VarcharType
 - StructField
 - StructType
 - VariantType
 - TimestampType
 - TimestampNTZType
 - DayTimeIntervalType
 - YearMonthIntervalType
 - CalendarIntervalType
 
 - Row
 - Functions
- Normal Functions
 - Conditional Functions
 - Predicate Functions
 - Sort Functions
 - Mathematical Functions
 - String Functions
 - Bitwise Functions
 - Date and Timestamp Functions
 - Hash Functions
 - Collection Functions
 - Array Functions
 - Struct Functions
 - Map Functions
 - Aggregate Functions
 - Window Functions
 - Generator Functions
 - Partition Transformation Functions
 - CSV Functions
 - JSON Functions
 - VARIANT Functions
 - XML Functions
 - URL Functions
 - Misc Functions
 - UDF, UDTF and UDT
 - Table-Valued Functions
 
 - Window
- pyspark.sql.Window.currentRow
 - pyspark.sql.Window.orderBy
 - pyspark.sql.Window.partitionBy
 - pyspark.sql.Window.rangeBetween
 - pyspark.sql.Window.rowsBetween
 - pyspark.sql.Window.unboundedFollowing
 - pyspark.sql.Window.unboundedPreceding
 - pyspark.sql.WindowSpec.orderBy
 - pyspark.sql.WindowSpec.partitionBy
 - pyspark.sql.WindowSpec.rangeBetween
 - pyspark.sql.WindowSpec.rowsBetween
 
 - Grouping
- pyspark.sql.GroupedData.agg
 - pyspark.sql.GroupedData.apply
 - pyspark.sql.GroupedData.applyInArrow
 - pyspark.sql.GroupedData.applyInPandas
 - pyspark.sql.GroupedData.applyInPandasWithState
 - pyspark.sql.GroupedData.avg
 - pyspark.sql.GroupedData.cogroup
 - pyspark.sql.GroupedData.count
 - pyspark.sql.GroupedData.max
 - pyspark.sql.GroupedData.mean
 - pyspark.sql.GroupedData.min
 - pyspark.sql.GroupedData.pivot
 - pyspark.sql.GroupedData.sum
 - pyspark.sql.GroupedData.transformWithStateInPandas
 - pyspark.sql.PandasCogroupedOps.applyInArrow
 - pyspark.sql.PandasCogroupedOps.applyInPandas
 
 - Catalog
- pyspark.sql.Catalog.cacheTable
 - pyspark.sql.Catalog.clearCache
 - pyspark.sql.Catalog.createExternalTable
 - pyspark.sql.Catalog.createTable
 - pyspark.sql.Catalog.currentCatalog
 - pyspark.sql.Catalog.currentDatabase
 - pyspark.sql.Catalog.databaseExists
 - pyspark.sql.Catalog.dropGlobalTempView
 - pyspark.sql.Catalog.dropTempView
 - pyspark.sql.Catalog.functionExists
 - pyspark.sql.Catalog.getDatabase
 - pyspark.sql.Catalog.getFunction
 - pyspark.sql.Catalog.getTable
 - pyspark.sql.Catalog.isCached
 - pyspark.sql.Catalog.listCatalogs
 - pyspark.sql.Catalog.listColumns
 - pyspark.sql.Catalog.listDatabases
 - pyspark.sql.Catalog.listFunctions
 - pyspark.sql.Catalog.listTables
 - pyspark.sql.Catalog.recoverPartitions
 - pyspark.sql.Catalog.refreshByPath
 - pyspark.sql.Catalog.refreshTable
 - pyspark.sql.Catalog.registerFunction
 - pyspark.sql.Catalog.setCurrentCatalog
 - pyspark.sql.Catalog.setCurrentDatabase
 - pyspark.sql.Catalog.tableExists
 - pyspark.sql.Catalog.uncacheTable
 
 - Avro
 - Observation
 - UDF
 - UDTF
 - VariantVal
 - Protobuf
 - Python Data Source
- pyspark.sql.datasource.DataSource.name
 - pyspark.sql.datasource.DataSource.reader
 - pyspark.sql.datasource.DataSource.schema
 - pyspark.sql.datasource.DataSource.streamReader
 - pyspark.sql.datasource.DataSource.writer
 - pyspark.sql.datasource.DataSourceReader.partitions
 - pyspark.sql.datasource.DataSourceReader.read
 - pyspark.sql.datasource.DataSourceRegistration.register
 - pyspark.sql.datasource.DataSourceStreamReader.commit
 - pyspark.sql.datasource.DataSourceStreamReader.initialOffset
 - pyspark.sql.datasource.DataSourceStreamReader.latestOffset
 - pyspark.sql.datasource.DataSourceStreamReader.partitions
 - pyspark.sql.datasource.DataSourceStreamReader.read
 - pyspark.sql.datasource.DataSourceStreamReader.stop
 - pyspark.sql.datasource.DataSourceWriter.abort
 - pyspark.sql.datasource.DataSourceWriter.commit
 - pyspark.sql.datasource.DataSourceWriter.write
 
 - Stateful Processor