Skip to content

SQL

Style Guide * https://github.com/dlstadther/sql-style-guide/

Order of Operations

Operation Description
from source locations to get base data
where filters base data
group by aggregates base data
having filters aggregated data
select returns final data
order by sorts final data
limit reduces final data to a row count

General

Create daily (or DOM) snapshots of change data

Given a dataset which generates at a non-daily rate, create a view which displays the state as of each end-of-month (EOM) date.

Use of generate_series() is specific to postgres and various sql dialects may or may not have similar functionality.

with
-- list of all dates between range
static_dates as (
    SELECT date_trunc('day', dd):: date as the_date
    FROM generate_series
             ( '2022-01-01'::timestamp
             , '2022-08-31'::timestamp
             , '1 day'::interval) dd
)
-- list of EOM dates from "static_dates"
, eom_dates as (
    select cast(the_date - interval '1 day' as date) as the_date from static_dates where extract('day' from the_date) = 1
)
-- fake data to mock a dataset which exists multi-times per month, or missing months
, data as (
    select 'abc123' as client_id, 'x' as row_id, '1' as val, cast('2022-03-01' as date) as exec_date union all -- Mar, Apr
    select 'abc123' as client_id, 'y' as row_id, '2' as val, cast('2022-05-01' as date) as exec_date union all -- none
    select 'abc123' as client_id, 'z' as row_id, '3' as val, cast('2022-05-02' as date) as exec_date union all -- May
    select 'abc123' as client_id, 'a' as row_id, '4' as val, cast('2022-06-30' as date) as exec_date           -- Jun
)
-- create date range for when "data" rows are relevant
, data_range as (
    select
        client_id, row_id, val, exec_date
        , lead(exec_date) over(partition by client_id order by exec_date asc) as next_date
    from
        data
)
-- ensure that all "data_range" rows include a "next_date" (the last record per id will have a null "next_date")
, data_range_notnull as (
    select
        client_id, row_id, val, exec_date,
        case when next_date is null then current_date else next_date end as next_date
    from
        data_range
)
-- join data range to all dates between "exec_date" and "next_date"
-- filter to only EOM dates
select
    *
from
    data_range_notnull as d
left join
    static_dates as sd
on
    d.exec_date <= sd.the_date
    and d.next_date > sd.the_date
where
    sd.the_date in (select the_date from eom_dates)