You are on page 1 of 19

Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

1 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

root@redshift/dev=# SELECT l_orderkey, COUNT(*)

2 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

FROM lineitem
GROUP BY 1
ORDER BY 2 DESC
LIMIT 100;
l_orderkey | count
------------+----------
[NULL] | 124993010
260642439 | 80
240404513 | 80
56095490 | 72
348088964 | 72
466727011 | 72
438870661 | 72
...
...

-- Materialize only the candidate column into a temporary table that is
-- distributed on that same column, so its distribution can be checked
-- without copying the full rows of lineitem.
CREATE TEMPORARY TABLE lineitem_dk_l_partkey
    DISTKEY (l_partkey)
AS
    SELECT li.l_partkey
    FROM lineitem AS li;

-- Identify the table OID


root@redshift/tpch=# SELECT 'lineitem_dk_l_partkey'::regclass::oid;
oid
--------
240791
(1 row)

3 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

-- For the table whose OID is given in the innermost filter, report:
--   skew_rows     : row skew as exposed by svv_table_info
--   storage_skew  : blocks on the fullest slice / blocks on the emptiest
--                   populated slice (denominator floored at 1)
--   pct_populated : percentage of all slices that hold at least one block
SELECT
    ti."table" AS tablename,
    ti.skew_rows,
    ROUND(
        iq.max_blocks_per_slice::FLOAT
        / GREATEST(COALESCE(iq.min_blocks_per_slice, 0)::INT, 1)::FLOAT,
        5
    ) AS storage_skew,
    ROUND(
        (100 * iq.dist_slice)::FLOAT
        / (SELECT COUNT(DISTINCT slice) FROM stv_slices),
        2
    ) AS pct_populated
FROM svv_table_info AS ti
JOIN (
    -- Collapse the per-slice block counts into one summary row per table.
    SELECT
        per_slice.tbl,
        MIN(per_slice.block_count)     AS min_blocks_per_slice,
        MAX(per_slice.block_count)     AS max_blocks_per_slice,
        COUNT(DISTINCT per_slice.slice) AS dist_slice
    FROM (
        -- Number of blocks each table occupies on each slice.
        SELECT
            b.tbl,
            b.slice,
            COUNT(*) AS block_count
        FROM stv_blocklist AS b
        GROUP BY b.tbl, b.slice
    ) AS per_slice
    WHERE per_slice.tbl = 240791
    GROUP BY per_slice.tbl
) AS iq
    ON iq.tbl = ti.table_id;
tablename | skew_rows | storage_skew | pct_populated
-----------------------+-----------+--------------+---------------
lineitem_dk_l_partkey | 1.00 | 1.00259 | 100
(1 row)

-- orders table: every column is NOT NULL and rows are distributed by KEY
-- on o_orderdate (declared as a table attribute after the column list).
CREATE TABLE orders (
    o_orderkey      BIGINT         NOT NULL,
    o_custkey       BIGINT         NOT NULL,
    o_orderstatus   CHAR(1)        NOT NULL,
    o_totalprice    NUMERIC(12,2)  NOT NULL,
    o_orderdate     DATE           NOT NULL,
    o_orderpriority CHAR(15)       NOT NULL,
    o_clerk         CHAR(15)       NOT NULL,
    o_shippriority  INTEGER        NOT NULL,
    o_comment       VARCHAR(79)    NOT NULL
)
DISTKEY (o_orderdate);

4 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

root@redshift/tpch=# SELECT o_orderdate, count(*)


FROM orders GROUP BY 1 ORDER BY 2 DESC;
o_orderdate | count
-------------+---------
1993-01-18 | 2651712
1993-08-29 | 2646252
1993-12-05 | 2644488
1993-12-04 | 2642598
...
...
1993-09-28 | 2593332
1993-12-12 | 2593164
1993-11-14 | 2593164
1993-12-07 | 2592324
(365 rows)

-- How many values are assigned to each slice


root@redshift/tpch=# SELECT rows/2592324 assigned_values, COUNT(*) number_of_slices FROM stv_tbl_p
GROUP BY 1 ORDER BY 1;
assigned_values | number_of_slices
-----------------+------------------
0 | 307
1 | 192
2 | 61
3 | 13
4 | 3
(5 rows)

5 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

pct_slices_populated

-- orders table: every column is NOT NULL and rows are distributed by KEY
-- on o_orderdate (declared as a table attribute after the column list).
CREATE TABLE orders (
    o_orderkey      BIGINT         NOT NULL,
    o_custkey       BIGINT         NOT NULL,
    o_orderstatus   CHAR(1)        NOT NULL,
    o_totalprice    NUMERIC(12,2)  NOT NULL,
    o_orderdate     DATE           NOT NULL,
    o_orderpriority CHAR(15)       NOT NULL,
    o_clerk         CHAR(15)       NOT NULL,
    o_shippriority  INTEGER        NOT NULL,
    o_comment       VARCHAR(79)    NOT NULL
)
DISTKEY (o_orderdate);

6 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

SELECT ... FROM orders


JOIN ...
JOIN ...
WHERE ...
AND o_orderdate between current_date-7 and current_date-1
GROUP BY ...;

root@redshift/tpch=# SELECT SLICE_NUM(), COUNT(*) FROM orders


WHERE o_orderdate BETWEEN current_date-7 AND current_date-1
GROUP BY 1 ORDER BY 1;
slice_num | count
-----------+---------
3 | 2553840
33 | 2553892
40 | 2555232
41 | 2553092
54 | 2554296
74 | 2552168
76 | 2552224
(7 rows)

SELECT
ti."table", ti.diststyle, RTRIM(a.attname) column_name,
COUNT(DISTINCT s.query ||'-'|| s.segment ||'-'|| s.step) as num_scans,

7 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

COUNT(DISTINCT CASE WHEN TRANSLATE(TRANSLATE(info,')',' '),'(',' ') LIKE ('%'|| a.attname ||'%
FROM stl_explain p
JOIN stl_plan_info i ON ( i.userid=p.userid AND i.query=p.query AND i.nodeid=p.nodeid )
JOIN stl_scan s ON (s.userid=i.userid AND s.query=i.query AND s.segment=i.segment AND s.step=i.ste
JOIN svv_table_info ti ON ti.table_id=s.tbl
JOIN pg_attribute a ON (a.attrelid=s.tbl AND a.attnum > 0)
WHERE s.tbl IN ([table_id])
GROUP BY 1,2,3,a.attnum
ORDER BY attnum;

SELECT
ti.schemaname||'.'||ti.tablename AS "table",
ti.tbl_rows,
AVG(r.s_rows_pre_filter) avg_s_rows_pre_filter,
100*ROUND(1::float - AVG(r.s_rows_pre_filter)::float/ti.tbl_rows::float,6) avg_prune_pct,
AVG(r.s_rows) avg_s_rows,
100*ROUND(1::float - AVG(r.s_rows)::float/AVG(r.s_rows_pre_filter)::float,6) avg_filter_pct,
COUNT(DISTINCT i.query) AS num,
AVG(r.time) AS scan_time,
MAX(i.query) AS query, TRIM(info) as filter
FROM stl_explain p
JOIN stl_plan_info i ON ( i.userid=p.userid AND i.query=p.query AND i.nodeid=p.nodeid )
JOIN stl_scan s ON (s.userid=i.userid AND s.query=i.query AND s.segment=i.segment AND s.step=i.ste
JOIN (SELECT table_id,"table" tablename,schema schemaname,tbl_rows,unsorted,sortkey1,sortkey_num,d
JOIN (
SELECT query, segment, step, DATEDIFF(s,MIN(starttime),MAX(endtime)) AS time, SUM(rows) s_rows, SU
FROM stl_scan
WHERE userid>1 AND type=2
AND starttime < endtime

tbl_rows
avg_s_rows_pre_filter

avg_prune_pct
avg_s_rows
avg_filter_pct avg_s_rows_pre_filter

8 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

num:
scan_time
query
filter

table | public.orders
tbl_rows | 22751520
avg_s_rows_pre_filter | 12581124
avg_prune_pct | 44.7021
avg_s_rows | 5736106
avg_filter_pct | 54.407
num | 2
scan_time | 19
query | 1721037
filter | Filter: ((o_orderdate < '1993-08-01'::date) AND (o_orderdate >= '1993-05-0

-- Count the distinct order dates falling in the half-open range
-- [1993-05-01, 1993-08-01).
SELECT COUNT(DISTINCT o.o_orderdate)
FROM public.orders AS o
WHERE o.o_orderdate >= '1993-05-01'
  AND o.o_orderdate <  '1993-08-01';

tbl_rows
avg_s_rows
avg_s_rows_pre_filter

scan_time

9 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

-- Return the name of the column whose sort key ordinal is 1 for one table.
-- Substitute the table's OID for the [table_id] placeholder before running.
SELECT a.attname
FROM pg_attribute AS a
WHERE a.attsortkeyord = 1
  AND a.attrelid = [table_id];

tbl_rows avg_s_rows_pre_filter
avg_s_rows_pre_filter avg_s_rows

scan_time

10 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

-- Count the queries that (1) scanned the target table, (2) also scanned some
-- other table that has a column which is both the distribution key and part
-- of the sort key, and (3) logged at least one join step.
-- Substitute the table's OID for every [table_id] placeholder before running.
-- The closing parenthesis of the IN (...) list and the statement terminator
-- were missing from this snippet and are restored at the end.
SELECT COUNT(*) num_queries FROM stl_query
WHERE query IN (
    -- (1) Queries with a scan step on the target table.
    SELECT DISTINCT query FROM stl_scan
    WHERE tbl = [table_id] AND type = 2 AND userid > 1
    INTERSECT
    -- (2) Queries with a scan step on a different, qualifying table.
    SELECT DISTINCT query FROM stl_scan
    WHERE tbl <> [table_id] AND type = 2 AND userid > 1
    AND tbl IN (
        -- Tables with a column that is the DISTKEY and has a positive
        -- sort key ordinal...
        SELECT DISTINCT attrelid FROM pg_attribute
        WHERE attisdistkey = true AND attsortkeyord > 0
        MINUS
        -- ...minus tables having any column at sort key ordinal -1.
        -- NOTE(review): presumably this excludes interleaved sort keys; confirm.
        SELECT DISTINCT attrelid FROM pg_attribute
        WHERE attsortkeyord = -1)
    INTERSECT
    -- (3) Queries that logged a hash, nested-loop, or merge join step.
    (SELECT DISTINCT query FROM stl_hashjoin WHERE userid > 1
    UNION
    SELECT DISTINCT query FROM stl_nestloop WHERE userid > 1
    UNION
    SELECT DISTINCT query FROM stl_mergejoin WHERE userid > 1)
);

11 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

12 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

-- Count the distinct queries that scanned the target table and also logged
-- a hash, nested-loop, or merge join step.
-- Substitute the table's OID for the [table_id] placeholder before running.
SELECT COUNT(*)
FROM (
    SELECT DISTINCT sc.query
    FROM stl_scan AS sc
    WHERE sc.userid > 1
      AND sc.type = 2
      AND sc.tbl = [table_id]
    INTERSECT
    (
        SELECT DISTINCT query FROM stl_hashjoin
        UNION
        SELECT DISTINCT query FROM stl_nestloop
        UNION
        SELECT DISTINCT query FROM stl_mergejoin
    )
);

-- List, oldest first, the queries that scanned the target table and also
-- logged a hash, nested-loop, or merge join step, with their trimmed text.
-- Substitute the table's OID for the [table_id] placeholder before running.
SELECT
    q.userid,
    q.query,
    q.starttime,
    q.endtime,
    RTRIM(q.querytxt) AS qtxt
FROM stl_query AS q
WHERE q.query IN (
    SELECT DISTINCT sc.query
    FROM stl_scan AS sc
    WHERE sc.userid > 1
      AND sc.type = 2
      AND sc.tbl = [table_id]
    INTERSECT
    (
        SELECT DISTINCT query FROM stl_hashjoin
        UNION
        SELECT DISTINCT query FROM stl_nestloop
        UNION
        SELECT DISTINCT query FROM stl_mergejoin
    )
)
ORDER BY q.starttime;

13 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

-- Show the distribution style, size, and pct_used of a single table.
-- Substitute the table's OID for the [table_id] placeholder before running.
SELECT
    ti.table_id,
    ti."table",
    ti.diststyle,
    ti.size,
    ti.pct_used
FROM svv_table_info AS ti
WHERE ti.table_id = [table_id];

root@redshift/tpch=# SELECT "table", diststyle, size, pct_used


FROM svv_table_info
WHERE "table" LIKE 'orders_diststyle_%';
table | diststyle | size | pct_used
-----------------------+-----------------+-------+----------
orders_diststyle_even | EVEN | 6740 | 1.1785
orders_diststyle_key | KEY(o_orderkey) | 6740 | 1.1785
orders_diststyle_all | ALL | 19983 | 3.4941
(3 rows)

-- For one table, show its current size and pct_used next to an estimate of
-- both figures under DISTSTYLE ALL.
-- Substitute the table's OID for the [table_id] placeholder before running.
-- NOTE(review): a page-break header from the source document sits inside this
-- statement (between CASE and WHEN); skip those two lines when copying the SQL.
SELECT "table", size, pct_used,


-- A table that is already ALL reports its actual size as text.
CASE diststyle

14 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

WHEN 'ALL' THEN size::TEXT


-- Otherwise: '< ' followed by size multiplied by the number of distinct nodes.
ELSE '< ' || size*(SELECT COUNT(DISTINCT node) FROM stv_slices)
END est_distall_size,
-- Same rule applied to pct_used.
CASE diststyle
WHEN 'ALL' THEN pct_used::TEXT
ELSE '< ' || pct_used*(SELECT COUNT(DISTINCT node) FROM stv_slices)
END est_distall_pct_used
FROM svv_table_info WHERE table_id = [table_id];

SELECT '[table_id]' AS "table_id",


(SELECT count(*) FROM
(SELECT DISTINCT query FROM stl_insert WHERE tbl = [table_id]
INTERSECT
SELECT DISTINCT query FROM stl_delete WHERE tbl = [table_id])) AS num_updates,
(SELECT count(*) FROM
(SELECT DISTINCT query FROM stl_delete WHERE tbl = [table_id]
MINUS

15 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

SELECT DISTINCT query FROM stl_insert WHERE tbl = [table_id])) AS num_deletes,


(SELECT COUNT(*) FROM
(SELECT DISTINCT query FROM stl_insert WHERE tbl = [table_id]
MINUS
SELECT distinct query FROM stl_s3client
MINUS
SELECT DISTINCT query FROM stl_delete WHERE tbl = [table_id])) AS num_inserts,
(SELECT COUNT(*) FROM
(SELECT DISTINCT query FROM stl_insert WHERE tbl = [table_id]
INTERSECT
SELECT distinct query FROM stl_s3client)) as num_copies,

16 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

17 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

18 of 19 3/7/2022, 8:49 PM
Amazon Redshift Engineering’s Advanced Table Design Playbook: Distr... https://aws.amazon.com/pt/blogs/big-data/amazon-redshift-engineerings...

19 of 19 3/7/2022, 8:49 PM

You might also like