Mungeol Heo: Tip 4 GCP

BigQuery

Table usage

SELECT

date(creation_time) creation_date,

JOB_ID,

max(total_slot_ms) total_slot_ms,

max(total_bytes_billed) total_bytes_billed,

-- ARRAY_AGG(CONCAT(reference.project_id, '.', reference.dataset_id, '.', reference.table_id)

-- ORDER BY reference.project_id, reference.dataset_id, reference.table_id) AS table_combination,

ARRAY_TO_STRING(

ARRAY_AGG(

CONCAT(

reference.project_id, '.', reference.dataset_id,

'.', reference.table_id

)

ORDER BY

reference.project_id,

reference.dataset_id,

reference.table_id

','

) AS table_combination_str

FROM

`region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT,

UNNEST(referenced_tables) AS reference

WHERE

date(creation_time) between '2024-12-01'

and '2024-12-31'

and JOBS_BY_PROJECT.project_id = 'your project id'

and (

total_bytes_billed > 0

or total_slot_ms > 0

)

and array_length(referenced_tables) > 0

-- and query is not null

-- and parent_job_id is null

GROUP BY

all

WITH ExtractedTables AS (

select

creation_date,

JOB_ID,

total_slot_ms,

total_bytes_billed,

case when array_length(extracted_tables) > 0 then extracted_tables when array_length(extracted_tables_2) > 0 then extracted_tables_2 else [ 'no table extracted' ] end as extracted_tables

from

(

SELECT

date(creation_time) creation_date,

JOB_ID,

total_slot_ms,

total_bytes_billed,

REGEXP_EXTRACT_ALL(

replace(query, '`', ''),

r '((?:[a-zA-Z_-][a-zA-Z0-9_-]+[\.:]){2,3}[a-zA-Z0-9_-]+)'

) extracted_tables,

REGEXP_EXTRACT_ALL(

replace(query, '`', ''),

r '[fromFROM]{4}\s([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)'

) extracted_tables_2

FROM

`region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT

WHERE

date(creation_time) between '2024-12-01'

and '2024-12-31'

and JOBS_BY_PROJECT.project_id = 'your project id'

and (

total_bytes_billed > 0

or total_slot_ms > 0

) -- and array_length(referenced_tables) > 0

and query is not null

and parent_job_id is null

)

SELECT

creation_date,

JOB_ID,

max(total_slot_ms) total_slot_ms,

max(total_bytes_billed) total_bytes_billed,

-- ARRAY_AGG(DISTINCT table_name ORDER BY table_name) AS table_combination,

ARRAY_TO_STRING(

ARRAY_AGG(

DISTINCT table_name

ORDER BY

table_name

','

) AS table_combination_str

FROM

ExtractedTables,

UNNEST(extracted_tables) AS table_name

GROUP BY

all

pivot table
select date, m_name, m_value
from test.kpi_daily
unpivot(m_value for m_name in (b2c_signup_cnt, b2b_signup_cnt))

Format an array (bigQeury)
- format("%T", array_agg(date order by date))

Get the last value (bigQuery)

select array_agg(value order by time desc)[OFFSET(0)] value_last 
from (
select 1 time, 33 value
union all
select 2 time, 22 value
union all
select 3 time, 11 value
)

Unnest multiple arrays (bigQuery)

WITH data AS(
  SELECT 1 n, ['a', 'b'] AS r, [1,2] b, ['a1','b2'] c UNION ALL
  SELECT 2, ['c', 'd', 'e'], [3,4,5], ['c3','d4','e5'] UNION ALL
  select 3, ['f'], [6], ['f6']
)
 
SELECT n, r, b, c
FROM data, UNNEST(r) r WITH OFFSET pos1, unnest(b) b WITH OFFSET pos2, unnest(c) c WITH OFFSET pos3
where pos1=pos2 and pos2=pos3

Create dataset, execute a query and write to a table, export table to CSV in GCS (bigQuery)
- https://github.com/mungeol/training-data-analyst/blob/master/courses/machine_learning/deepdive/05_review/4_preproc.ipynb

Using the BigQuery Storage API to download large results (bigQuery)

https://cloud.google.com/bigquery/docs/pandas-gbq-migration#using_the_to_download_large_results

import pandas

sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"

# Use the BigQuery Storage API to download results more quickly.
df = pandas.read_gbq(sql, dialect='standard', use_bqstorage_api=True)

Readability (bigQuery)
- format(“%d”, 1000) = 1,000

with tmp as ( select 'a' usn, date('2019-01-01') login_date
union all select 'a', date('2019-01-02') union all select 'a', date('2019-01-04')
union all select 'a', date('2019-01-05') union all select 'a', date('2019-01-06')
union all select 'b', date('2019-01-02') union all select 'b', date('2019-01-03'))

, tmp_user_min_login_date as (select usn, min(login_date) start_login_date from tmp group by usn)

select usn, min(login_date) std_dt, max(login_date) end_dt, count(*) cnt from (
select ta.usn, login_date, row_number() over (order by ta.usn, login_date) + date_diff(start_login_date, login_date, day) num
from tmp ta inner join tmp_user_min_login_date tb on ta.usn=tb.usn
)
group by usn, num
having cnt > 1
order by usn, num desc

Column number of a table (bigQuery)
- select array_length(regexp_extract_all(to_json_string(`netmarble-gameservice-ai.rmt_stonemmotw_ml.feature_20190517`),"\":"))total_columns
  from `netmarble-gameservice-ai.rmt_stonemmotw_ml.feature_20190517` limit 1

Mungeol Heo

Monday, May 11, 2020

Tip 4 GCP

No comments:

Post a Comment