Sunday, March 21, 2021

Crosstabbing Data in Postgres with an Indefinite Number of Columns

PostgreSQL's tablefunc extension provides the crosstab function, which can be used to transform normalized data into a crosstabbed format.  However, one of the drawbacks of the crosstab function is that it requires that the names of all of the column headers be listed in one of its arguments.  In some cases (actually, pretty much every case when I have reason to use crosstab), the number of distinct values that are to be transformed into column headers is not fixed.  Finding those and manually creating the appropriate crosstab argument is tedious and error-prone.

To eliminate this step and to substantially automate the task of crosstabbing data in Postgres, I wrote the following script for execsql.  This uses Postgres' string_agg() aggregate function, Postgres' quote_ident() function, and an execsql substitution variable to dynamically create the list of column headers that is to be used as an argument to crosstab.

This script should be run using execsql's execute script metacommand.

 

-- ########################################################################
-- !x! BEGIN SCRIPT PG_XTAB with parameters (dtable, rowid, colhdr, value, value_type, xtable)
--        Purpose:
--            Crosstab any three columns from a data table.
--        Parameters:
--            dtable        : The name of the input data table.
--            rowid        : The name of the column that should be kept as a row identifier.
--            colhdr        : The name of the colulmn containing values that will
--                            be used as column headers in the output.
--            value        : The name of the column containing values that will
--                            appear in each cell of the cross-table.
--            value_type    : The Postgres data type of the values.  Types with spaces
--                            should be double-quoted (e.g., "double precision").
--            xtable        : The name of the temporary table that is to be created.
--            col_order    : Optional: the name of a column in the input table
--                            with values that have a 1:1 relatiionship with
--                            values of 'colhdr' and that should be used to order
--                            the crosstabbed columns in the output.
--        Notes:
--            1. There must be only one 'value' for each combination of 'rowid'
--                and 'colhdr'.  This constraint will be checked, and if it fails,
--                the failing values will be displayed if a global 'verbose' variable
--                is set, and you will be given an option to to continue or halt.
--                If it fails and 'verbose' is not set, the script will halt.
--            2. The output table 'xtable' is deleted at the beginning of the script,
--                so it will not exist if you choose to continue after an error.
-- =========================================================================

-- Drop the table for return values so that it doesn't exist if this exits on error.
drop table if exists !!#xtable!! cascade;

-- ------------------------------------------------------------------------
--        QA Check: Unique combination of row, column, and value.
-- ------------------------------------------------------------------------
drop view if exists pgx_ck1 cascade;
create temporary view pgx_ck1 as
select distinct !!#rowid!!, !!#colhdr!!, !!#value!!
from !!#dtable!!;
drop view if exists pgx_ck2 cascade;
create temporary view pgx_ck2 as
select !!#rowid!!, !!#colhdr!!
from pgx_ck1
group by !!#rowid!!, !!#colhdr!!
having count(*) > 1;
-- !x! if(hasrows(pgx_ck2))
    -- !x! if(sub_defined(verbose) and !!verbose!!)
        -- !x! prompt ask "There are multiple results per cell in the input to PG_XTAB, so the cross-table will not be created.  Do you want to continue anyway?" sub pgx_continue compare pgx_ck1 and pgx_ck2 key (!!#rowid!!, !!#colhdr!!, !!#value!!)
        -- !x! if(not !!pgx_continue!!)
            -- !x! HALT message "There are multiple results per cell in the input to the PG_XTAB script." display pgx_ck1
        -- !x! endif
    -- !x! else
        -- !x! HALT message "There are multiple results per cell in the input to the PG_XTAB script."
    -- !x! endif
-- !x! endif

-- ------------------------------------------------------------------------
--        QA Check: Only one column order value for each column header.
-- ------------------------------------------------------------------------
-- !x! if(sub_defined(#col_order))
    drop view if exists pgx_ck3 cascade;
    create temporary view pgx_ck3 as
    select !!#colhdr!!, count(*) as ordering_values
    from (select distinct !!#colhdr!!, !!#col_order!! from !!#dtable!!) as vs
    group by !!#colhdr!!
    having count(*) > 1;
    -- !x! if(hasrows(pgx_ck3))
        -- !x! HALT "There are multiple ordering values for each column header value in the input to PG_XTAB."
    -- !x! endif
-- !x! endif


-- ------------------------------------------------------------------------
--        Get the list of column headers
-- ------------------------------------------------------------------------
drop view if exists pgx_hdrs cascade;
-- !x! if(sub_defined(#col_order))
    -- !x! sub orderclause order by !!#col_order!!
    -- !x! sub ordercol , !!#col_order!!
-- !x! else
    -- !x! sub_empty orderclause
    -- !x! sub_empty ordercol
-- !x! endif
create temporary view pgx_hdrs as
select hdr
from (
    select distinct quote_ident(!!#colhdr!!) as hdr !!ordercol!!
    from !!#dtable!!
    !!orderclause!!
    ) as hd
!!orderclause!!;

-- ------------------------------------------------------------------------
--        Get the three columns of data in a form to be crosstabbed.
-- ------------------------------------------------------------------------
drop view if exists pgx_xinp cascade;
create temporary view pgx_xinp as
select
    !!#rowid!!,
    quote_ident(!!#colhdr!!) as hdr,
    !!#value!! as value
from
    !!#dtable!!
order by
    !!#rowid!!;

-- ------------------------------------------------------------------------
--        Get the type declarations for the crosstabbed columns.
-- ------------------------------------------------------------------------
drop view if exists pgx_type_decl cascade;
create temporary view pgx_type_decl as
select string_agg(hdr || ' !!#value_type!!', ', ') as coltypes
from pgx_hdrs;
-- !x! subdata ~coltypes pgx_type_decl

-- ------------------------------------------------------------------------
--        Create the crosstab.
-- ------------------------------------------------------------------------
create temporary table !!#xtable!! as
select *
from
    crosstab('select * from pgx_xinp', 'select * from pgx_hdrs')
        as ct(!!#rowid!! text, !!~coltypes!!);


-- !x! END SCRIPT PG_XTAB
-- ########################################################################

No comments:

Post a Comment