Monday, May 11, 2020

Tip 4 GCP

BigQuery
  • Table usage
SELECT 
  date(creation_time) creation_date, 
  JOB_ID, 
  max(total_slot_ms) total_slot_ms, 
  max(total_bytes_billed) total_bytes_billed, 
  -- ARRAY_AGG(CONCAT(reference.project_id, '.', reference.dataset_id, '.', reference.table_id) 
  -- ORDER BY reference.project_id, reference.dataset_id, reference.table_id) AS table_combination,
  ARRAY_TO_STRING(
    ARRAY_AGG(
      CONCAT(
        reference.project_id, '.', reference.dataset_id, 
        '.', reference.table_id
      ) 
      ORDER BY 
        reference.project_id, 
        reference.dataset_id, 
        reference.table_id
    ), 
    ','
  ) AS table_combination_str 
FROM 
  `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT, 
  UNNEST(referenced_tables) AS reference 
WHERE 
  date(creation_time) between '2024-12-01' 
  and '2024-12-31' 
  and JOBS_BY_PROJECT.project_id = 'your project id' 
  and (
    total_bytes_billed > 0 
    or total_slot_ms > 0
  ) 
  and array_length(referenced_tables) > 0 
  -- and query is not null 
  -- and parent_job_id is null 
GROUP BY 
  all


WITH ExtractedTables AS (
  select 
    creation_date, 
    JOB_ID, 
    total_slot_ms, 
    total_bytes_billed, 
    case when array_length(extracted_tables) > 0 then extracted_tables when array_length(extracted_tables_2) > 0 then extracted_tables_2 else [ 'no table extracted' ] end as extracted_tables 
  from 
    (
      SELECT 
        date(creation_time) creation_date, 
        JOB_ID, 
        total_slot_ms, 
        total_bytes_billed, 
        REGEXP_EXTRACT_ALL(
          replace(query, '`', ''), 
          r '((?:[a-zA-Z_-][a-zA-Z0-9_-]+[\.:]){2,3}[a-zA-Z0-9_-]+)'
        ) extracted_tables, 
        REGEXP_EXTRACT_ALL(
          replace(query, '`', ''), 
          r '[fromFROM]{4}\s([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)'
        ) extracted_tables_2 
      FROM 
        `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT 
      WHERE 
        date(creation_time) between '2024-12-01' 
        and '2024-12-31' 
        and JOBS_BY_PROJECT.project_id = 'your project id' 
        and (
          total_bytes_billed > 0 
          or total_slot_ms > 0
        ) -- and array_length(referenced_tables) > 0 
        and query is not null 
        and parent_job_id is null
    )
SELECT 
  creation_date, 
  JOB_ID, 
  max(total_slot_ms) total_slot_ms, 
  max(total_bytes_billed) total_bytes_billed, 
  -- ARRAY_AGG(DISTINCT table_name ORDER BY table_name) AS table_combination, 
  ARRAY_TO_STRING(
    ARRAY_AGG(
      DISTINCT table_name 
      ORDER BY 
        table_name
    ), 
    ','
  ) AS table_combination_str 
FROM 
  ExtractedTables, 
  UNNEST(extracted_tables) AS table_name 
GROUP BY 
  all

  • pivot table
  • select date, m_name, m_value
    from test.kpi_daily
    unpivot(m_value for m_name in (b2c_signup_cnt, b2b_signup_cnt))

  • Format an array (bigQeury)
    • format("%T", array_agg(date order by date))
  • Get the last value (bigQuery)

    select array_agg(value order by time desc)[OFFSET(0)] value_last 
    from (
    select 1 time, 33 value
    union all
    select 2 time, 22 value
    union all
    select 3 time, 11 value
    )


  • Unnest multiple arrays (bigQuery)

    WITH data AS(
      SELECT 1 n, ['a', 'b'] AS r, [1,2] b, ['a1','b2'] c UNION ALL
      SELECT 2, ['c', 'd', 'e'], [3,4,5], ['c3','d4','e5'] UNION ALL
      select 3, ['f'], [6], ['f6']
    )
     
    SELECT n, r, b, c
    FROM data, UNNEST(r) r WITH OFFSET pos1, unnest(b) b WITH OFFSET pos2, unnest(c) c WITH OFFSET pos3
    where pos1=pos2 and pos2=pos3


  • Create dataset, execute a query and write to a table, export table to CSV in GCS (bigQuery)

  • Using the BigQuery Storage API to download large results (bigQuery)

  • Readability (bigQuery)

    • format(“%d”, 1000) = 1,000
  • Login streak (bigQuery)


    with tmp as select 'a' usn, date('2019-01-01') login_date
    union all select 'a'date('2019-01-02'union all select 'a'date('2019-01-04')
    union all select 'a'date('2019-01-05'union all select 'a'date('2019-01-06')
    union all select 'b'date('2019-01-02'union all select 'b'date('2019-01-03'))

    , tmp_user_min_login_date as (select usn, min(login_date) start_login_date from tmp group by usn)

    select usn, min(login_date) std_dt, max(login_date) end_dt, count(*) cnt from (
    select ta.usn, login_date, row_number() over (order by ta.usn, login_date) + date_diff(start_login_date, login_date, day) num
    from tmp ta inner join tmp_user_min_login_date tb on ta.usn=tb.usn
    )
    group by usn, num
    having cnt > 1
    order by usn, num desc


  • Column number of a table (bigQuery)

    • select array_length(regexp_extract_all(to_json_string(`netmarble-gameservice-ai.rmt_stonemmotw_ml.feature_20190517`),"\":"))total_columns
      from `netmarble-gameservice-ai.rmt_stonemmotw_ml.feature_20190517` limit 1

No comments:

Post a Comment

Note: Only a member of this blog may post a comment.