You are on page 1of 2

DELTA LAKE DDL/DML: UPDATE, DELETE, INSERT, ALTER TABLE TIME TRAVEL (CONTINUED)

Update rows that match a predicate condition Rollback a table to an earlier version
WITH SPARK SQL UPDATE tableName SET event = 'click' WHERE event = 'clk' -- RESTORE requires Delta Lake version 0.7.0+ & DBR 7.4+.
RESTORE tableName VERSION AS OF 0
Delete rows that match a predicate condition RESTORE tableName TIMESTAMP AS OF "2020-12-18"
Delta Lake is an open source storage layer that brings ACID DELETE FROM tableName WHERE "date < '2017-01-01"
transactions to Apache Spark™ and big data workloads. Insert values directly into table
INSERT INTO TABLE tableName VALUES ( UTILITY METHODS
delta.io | Documentation | GitHub | Delta Lake on Databricks (8003, "Kim Jones", "2020-12-18", 3.875),
(8004, "Tim Jones", "2020-12-20", 3.750) View table details
);
CREATE AND QUERY DELTA TABLES -- Insert using SELECT statement
DESCRIBE DETAIL tableName
DESCRIBE FORMATTED tableName
INSERT INTO tableName SELECT * FROM sourceTable
Create and use managed database -- Atomically replace all data in table with new values Delete old files with Vacuum
-- Managed database is saved in the Hive metastore. INSERT OVERWRITE loan_by_state_delta VALUES (...)
VACUUM tableName [RETAIN num HOURS] [DRY RUN]
Default database is named "default".
DROP DATABASE IF EXISTS dbName;
Upsert (update + insert) using MERGE Clone a Delta Lake table
CREATE DATABASE dbName; MERGE INTO target -- Deep clones copy data from source, shallow clones don't.
USE dbName -- This command avoids having to specify USING updates CREATE TABLE [dbName.] targetName
dbName.tableName every time instead of just tableName. ON target.Id = updates.Id [SHALLOW | DEEP] CLONE sourceName [VERSION AS OF 0]
WHEN MATCHED AND target.delete_flag = "true" THEN [LOCATION "path/to/table"]
Query Delta Lake table by table name (preferred) DELETE -- specify location only for path-based tables
WHEN MATCHED THEN
/* You can refer to Delta Tables by table name, or by
UPDATE SET * -- star notation means all columns
path. Table name is the preferred way, since named tables
WHEN NOT MATCHED THEN Interoperability with Python / DataFrames
are managed in the Hive Metastore (i.e., when you DROP a
INSERT (date, Id, data) -- or, use INSERT * -- Read name-based table from Hive metastore into DataFrame
named table, the data is dropped also — not the case for
VALUES (date, Id, data) df = spark.table("tableName")
path-based tables.) */
-- Read path-based table into DataFrame
SELECT * FROM [dbName.] tableName
Insert with Deduplication using MERGE df = spark.read.format("delta").load("/path/to/delta_table")
Query Delta Lake table by path MERGE INTO logs Run SQL queries from Python
SELECT * FROM delta.`path/to/delta_table` -- note USING newDedupedLogs
spark.sql("SELECT * FROM tableName")
backticks ON logs.uniqueId = newDedupedLogs.uniqueId
spark.sql("SELECT * FROM delta.`/path/to/delta_table`")
WHEN NOT MATCHED
Convert Parquet table to Delta Lake format in place THEN INSERT * Modify data retention settings for Delta Lake table
-- logRetentionDuration -> how long transaction log history
-- by table name Alter table schema — add columns
CONVERT TO DELTA [dbName.]tableName is kept, deletedFileRetentionDuration -> how long ago a file
[PARTITIONED BY (col_name1 col_type1, col_name2 ALTER TABLE tableName ADD COLUMNS ( must have been deleted before being a candidate for VACCUM.
col_type2)] col_name data_type ALTER TABLE tableName
[FIRST|AFTER colA_name]) SET TBLPROPERTIES(
-- path-based tables delta.logRetentionDuration = "interval 30 days",
CONVERT TO DELTA parquet.`/path/to/table` -- note backticks Alter table — add constraint delta.deletedFileRetentionDuration = "interval 7 days"
[PARTITIONED BY (col_name1 col_type1, col_name2 col_type2)] -- Add "Not null" constraint: );
ALTER TABLE tableName CHANGE COLUMN col_name SET NOT NULL SHOW TBLPROPERTIES tableName;
Create Delta Lake table as SELECT * with no upfront -- Add "Check" constraint:
schema definition ALTER TABLE tableName
ADD CONSTRAINT dateWithinRange CHECK date > "1900-01-01"
CREATE TABLE [dbName.] tableName -- Drop constraint: PERFORMANCE OPTIMIZATIONS
USING DELTA ALTER TABLE tableName DROP CONSTRAINT dateWithinRange
AS SELECT * FROM tableName | parquet.`path/to/data` Compact data files with Optimize and Z-Order
[LOCATION `/path/to/table`] *Databricks Delta Lake feature
-- using location = unmanaged table OPTIMIZE tableName

Create table, define schema explicitly with SQL DDL


TIME TRAVEL [ZORDER BY (colNameA, colNameB)]

CREATE TABLE [dbName.] tableName ( View transaction log (aka Delta Log) Auto-optimize tables
id INT [NOT NULL], *Databricks Delta Lake feature
DESCRIBE HISTORY tableName
name STRING, ALTER TABLE [table_name | delta.`path/to/delta_table`]
SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true)
date DATE,
Query historical versions of Delta Lake tables
int_rate FLOAT)
USING DELTA SELECT * FROM tableName VERSION AS OF 0 Cache frequently queried data in Delta Cache
[PARTITIONED BY (time, date)] -- optional SELECT * FROM tableName@v0 -- equivalent to VERSION AS OF 0 *Databricks Delta Lake feature
SELECT * FROM tableName TIMESTAMP AS OF "2020-12-18" CACHE SELECT * FROM tableName
Copy new data into Delta Lake table (with idempotent retries) -- or:
Find changes between 2 versions of table CACHE SELECT colA, colB FROM tableName WHERE colNameA > 0
COPY INTO [dbName.] targetTable
FROM (SELECT * FROM "/path/to/table") SELECT * FROM tableName VERSION AS OF 12
FILEFORMAT = DELTA -- or CSV, Parquet, ORC, JSON, etc. EXCEPT ALL SELECT * FROM tableName VERSION AS OF 11
WORKING WITH DELTA
DELTATABLES
TABLES TIME TRAVEL (CONTINUED)

# A DeltaTable is the entry point for interacting with Find changes between 2 versions of a table
WITH PYTHON tables programmatically in Python — for example, to
df1 = spark.read.format("delta").load(pathToTable)
perform updates or deletes.
df2 = spark.read.format("delta").option("versionAsOf",
from delta.tables import *
2).load("/path/to/delta_table")
Delta Lake is an open source storage layer that brings ACID df1.exceptAll(df2).show()
deltaTable = DeltaTable.forName(spark, tableName)
transactions to Apache Spark™ and big data workloads. deltaTable = DeltaTable.forPath(spark, Rollback a table by version or timestamp
delta.`path/to/table`)
delta.io | Documentation | GitHub | API reference | Databricks deltaTable.restoreToVersion(0)
deltaTable.restoreToTimestamp('2020-12-01')

READS AND WRITES WITH DELTA LAKE DELTA LAKE DDL/DML: UPDATES, DELETES, INSERTS, MERGES

Read data from pandas DataFrame Delete rows that match a predicate condition
UTILITY METHODS
df = spark.createDataFrame(pdf) # predicate using SQL formatted string Run Spark SQL queries in Python
# where pdf is a pandas DF deltaTable.delete("date < '2017-01-01'")
# then save DataFrame in Delta Lake format as shown below # predicate using Spark SQL functions spark.sql("SELECT * FROM tableName")
deltaTable.delete(col("date") < "2017-01-01") spark.sql("SELECT * FROM delta.`/path/to/delta_table`")
Read data using Apache Spark™ spark.sql("DESCRIBE HISTORY tableName")
# read by path Update rows that match a predicate condition
df = (spark.read.format("parquet"|"csv"|"json"|etc.) # predicate using SQL formatted string
Compact old files with Vacuum
.load("/path/to/delta_table")) deltaTable.update(condition = "eventType = 'clk'", deltaTable.vacuum() # vacuum files older than default
# read table from Hive metastore set = { "eventType": "'click'" } ) retention period (7 days)
df = spark.table("events") # predicate using Spark SQL functions deltaTable.vacuum(100) # vacuum files not required by
deltaTable.update(condition = col("eventType") == "clk", versions more than 100 hours old
Save DataFrame in Delta Lake format set = { "eventType": lit("click") } )
Clone a Delta Lake table
(df.write.format("delta")
.mode("append"|"overwrite")
Upsert (update + insert) using MERGE deltaTable.clone(target="/path/to/delta_table/",
.partitionBy("date") # optional # Available options for merges [see documentation for isShallow=True, replace=True)
.option("mergeSchema", "true") # option - evolve schema details]:
.saveAsTable("events") | .save("/path/to/delta_table") .whenMatchedUpdate(...) | .whenMatchedUpdateAll(...) | Get DataFrame representation of a Delta Lake table
) .whenNotMatchedInsert(...) | .whenMatchedDelete(...) df = deltaTable.toDF()
(deltaTable.alias("target").merge(
Streaming reads (Delta table as streaming source) source = updatesDF.alias("updates"), Run SQL queries on Delta Lake tables
# by path or by table name condition = "target.eventId = updates.eventId") spark.sql("SELECT * FROM tableName")
df = (spark.readStream .whenMatchedUpdateAll() spark.sql("SELECT * FROM delta.`/path/to/delta_table`")
.format("delta") .whenNotMatchedInsert(
.schema(schema) values = {
.table("events") | .load("/delta/events") "date": "updates.date",
) "eventId": "updates.eventId", PERFORMANCE OPTIMIZATIONS
"data": "updates.data",
Streaming writes (Delta table as a sink) "count": 1
Compact data files with Optimize and Z-Order
}
streamingQuery = ( ).execute() *Databricks Delta Lake feature
df.writeStream.format("delta") ) spark.sql("OPTIMIZE tableName [ZORDER BY (colA, colB)]")
.outputMode("append"|"update"|"complete")
.option("checkpointLocation", "/path/to/checkpoints") Insert with Deduplication using MERGE Auto-optimize tables
.trigger(once=True|processingTime="10 seconds") (deltaTable.alias("logs").merge( *Databricks Delta Lake feature. For existing tables:
.table("events") | .start("/delta/events") newDedupedLogs.alias("newDedupedLogs"), spark.sql("ALTER TABLE [table_name |
) "logs.uniqueId = newDedupedLogs.uniqueId") delta.`path/to/delta_table`]
.whenNotMatchedInsertAll() SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true)
.execute() To enable auto-optimize for all new Delta Lake tables:
) spark.sql("SET spark.databricks.delta.properties.
CONVERT PARQUET TO DELTA LAKE defaults.autoOptimize.optimizeWrite = true")
TIME TRAVEL
Convert Parquet table to Delta Lake format in place Cache frequently queried data in Delta Cache
View transaction log (aka Delta Log) *Databricks Delta Lake feature
deltaTable = DeltaTable.convertToDelta(spark,
"parquet.`/path/to/parquet_table`") fullHistoryDF = deltaTable.history() spark.sql("CACHE SELECT * FROM tableName")
-- or:
partitionedDeltaTable = DeltaTable.convertToDelta(spark, Query historical versions of Delta Lake tables spark.sql("CACHE SELECT colA, colB FROM tableName
"parquet.`/path/to/parquet_table`", "part int") WHERE colNameA > 0")
# choose only one option: versionAsOf, or timestampAsOf
df = (spark.read.format("delta")
.option("versionAsOf", 0)
.option("timestampAsOf", "2020-12-18")
.load("/path/to/delta_table"))

You might also like