pg_total_relation_size: query: | SELECT relnamespace::regnamespace as schemaname, relname as relname, pg_total_relation_size(oid) bytes FROM pg_class WHERE relkind = 'r'; metrics: - schemaname: usage: "LABEL" description: "Name of the schema that this table is in" - relname: usage: "LABEL" description: "Name of this table" - bytes: usage: "GAUGE" description: "total disk space usage for the specified table and associated indexes" pg_blocked: query: | SELECT count(blocked.transactionid) AS queries, '__transaction__' AS table FROM pg_catalog.pg_locks blocked WHERE NOT blocked.granted AND locktype = 'transactionid' GROUP BY locktype UNION SELECT count(blocked.relation) AS queries, blocked.relation::regclass::text AS table FROM pg_catalog.pg_locks blocked WHERE NOT blocked.granted AND locktype != 'transactionid' GROUP BY relation metrics: - queries: usage: "GAUGE" description: "The current number of blocked queries" - table: usage: "LABEL" description: "The table on which a query is blocked" pg_oldest_blocked: query: | SELECT coalesce(extract('epoch' from max(clock_timestamp() - state_change)), 0) age_seconds FROM pg_stat_activity WHERE wait_event_type = 'Lock' AND state='active' metrics: - age_seconds: usage: "GAUGE" description: "Largest number of seconds any transaction is currently waiting on a lock" pg_slow: query: | SELECT COUNT(*) AS queries FROM pg_stat_activity WHERE state = 'active' AND (now() - query_start) > '1 seconds'::interval metrics: - queries: usage: "GAUGE" description: "Current number of slow queries" pg_long_running_transactions: query: | SELECT COUNT(*) as transactions, coalesce(MAX(EXTRACT(EPOCH FROM (clock_timestamp() - xact_start))), 0) AS age_in_seconds FROM pg_stat_activity WHERE state is distinct from 'idle' AND (now() - xact_start) > '1 minutes'::interval AND query not like 'autovacuum:%' metrics: - transactions: usage: "GAUGE" description: "Current number of long running transactions" - age_in_seconds: usage: "GAUGE" description: "The current maximum transaction age in seconds" pg_stuck_idle_in_transaction: query: | SELECT COUNT(*) AS queries FROM pg_stat_activity WHERE state = 'idle in transaction' AND (now() - query_start) > '10 minutes'::interval metrics: - queries: usage: "GAUGE" description: "Current number of queries that are stuck being idle in transactions" # All xid and lsn metrics here are reported mod 2^52 to ensure they # fit within a float for Prometheus :( Really annoying that counters # aren't stored in a 64-bit integer. Note that for queries that report # floats this only works because postgres_exporter does know to set # extra_float_digits (which it sets to 2). So they don't print in # exponential notation and precision is maintained up to 2^53-1. pg_vacuum: query: | SELECT COUNT(*) AS queries, MAX(EXTRACT(EPOCH FROM (clock_timestamp() - query_start))) AS age_in_seconds FROM pg_catalog.pg_stat_activity WHERE state = 'active' AND trim(query) ~* '\AVACUUM (?!ANALYZE)' metrics: - queries: usage: "GAUGE" description: "The current number of VACUUM queries" - age_in_seconds: usage: "GAUGE" description: "The current maximum VACUUM query age in seconds" pg_vacuum_analyze: query: | SELECT COUNT(*) AS queries, MAX(EXTRACT(EPOCH FROM (clock_timestamp() - query_start))) AS age_in_seconds FROM pg_catalog.pg_stat_activity WHERE state = 'active' AND trim(query) ~* '\AVACUUM ANALYZE' metrics: - queries: usage: "GAUGE" description: "The current number of VACUUM ANALYZE queries" - age_in_seconds: usage: "GAUGE" description: "The current maximum VACUUM ANALYZE query age in seconds" pg_stuck_idle_in_transaction: query: | SELECT COUNT(*) AS queries FROM pg_stat_activity WHERE state = 'idle in transaction' AND (now() - query_start) > '10 minutes'::interval metrics: - queries: usage: "GAUGE" description: "Current number of queries that are stuck being idle in transactions" pg_vacuum_queue: #master: true # Until postgres_exporter is upgraded. See # https://gitlab.com/gitlab-org/omnibus-gitlab/issues/4887 query: | with table_opts_vs_statistic as ( select pg_class.oid, pg_class.relname, coalesce(nspname, 'public') as schemaname, pg_class.relpages, pg_class.reltuples, case when array_to_string(reloptions, '') like '%autovacuum_vacuum_threshold%' then regexp_replace(array_to_string(reloptions, ''), '.*autovacuum_vacuum_threshold=([0-9.]+).*', E'\\1')::int8 else current_setting('autovacuum_vacuum_threshold')::int8 end as autovacuum_vacuum_threshold, case when array_to_string(reloptions, '') like '%autovacuum_vacuum_scale_factor%' then regexp_replace(array_to_string(reloptions, ''), '.*autovacuum_vacuum_scale_factor=([0-9.]+).*', E'\\1')::numeric else current_setting('autovacuum_vacuum_scale_factor')::numeric end as autovacuum_vacuum_scale_factor, case when array_to_string(reloptions, '') ~ 'autovacuum_enabled=(false|off)' then false else true end as autovacuum_enabled, n_dead_tup, last_autovacuum, last_vacuum from pg_class join pg_namespace ns on relnamespace = ns.oid join pg_stat_all_tables psat on psat.relid = pg_class.oid where relkind in ('r','m') ), p as ( select pgspv.*,a.query,a.wait_event_type,a.wait_event,a.query_start from pg_stat_progress_vacuum pgspv left join pg_stat_activity a using (pid) ) select table_opts_vs_statistic.schemaname as schemaname, table_opts_vs_statistic.relname as relname, round((100 * table_opts_vs_statistic.n_dead_tup::numeric / nullif(table_opts_vs_statistic.reltuples, 0))::numeric, 2) as dead_tup_pct, table_opts_vs_statistic.reltuples::numeric as reltuples, table_opts_vs_statistic.n_dead_tup, (relpages::bigint*8*1024) AS table_size_bytes, 'V. Threshold:' || table_opts_vs_statistic.autovacuum_vacuum_threshold || ', V. Scale Factor: ' || (table_opts_vs_statistic.autovacuum_vacuum_scale_factor)::numeric *100 ||' %' || case when not autovacuum_enabled then ', DISABLED' else ', enabled' end as "effective_settings", case when last_autovacuum > coalesce(last_vacuum, '0001-01-01') then last_autovacuum::timestamp(0) when last_vacuum is not null then last_vacuum::timestamp(0) else null end as "last_vacuumed", case when last_autovacuum > coalesce(last_vacuum, '0001-01-01') then 'auto' when last_vacuum is not null then 'manual' else null end as "type", coalesce(p.phase, 'in queue') as status, p.pid as pid, coalesce (p.query,'')as action, case when p.pid is null then null else coalesce(wait_event_type ||'.'|| wait_event, 'f') end as waiting, round(100.0 * p.heap_blks_scanned / nullif(p.heap_blks_total, 0), 1) AS scanned_pct, round(100.0 * p.heap_blks_vacuumed / nullif(p.heap_blks_total, 0), 1) AS vacuumed_pct, extract ('epoch' from now()-query_start) elapsed_time from table_opts_vs_statistic full outer join p on p.relid = table_opts_vs_statistic.oid and p.datname = current_database() where table_opts_vs_statistic.relpages >= 8 and autovacuum_vacuum_threshold + (autovacuum_vacuum_scale_factor::numeric * table_opts_vs_statistic.reltuples) < table_opts_vs_statistic.n_dead_tup metrics: - schemaname: usage: "LABEL" description: "Table Schema" - relname: usage: "LABEL" description: "Table name" - dead_tup_pct: usage: "GAUGE" description: "Estimated dead tuples percent" - reltuples: usage: "GAUGE" description: "Number of tuples in table" - n_dead_tup: usage: "GAUGE" description: "Estimated number of dead tuples" - table_size_bytes: usage: "GAUGE" description: "Estimated table size" - effective_settings: usage: "LABEL" description: "Autovacuums settings" - last_vacuumed: usage: "GAUGE" description: "Last time at which this table was vacuumed" - type: usage: "LABEL" description: "Last vacuum type" - status: usage: "LABEL" description: "Vacuum actual status" - pid: usage: "GAUGE" description: "Vacuum process id" - action: usage: "LABEL" description: "Type of vacuum executed" - waiting: usage: "LABEL" description: "Vacuum queue status" - scanned_pct: usage: "GAUGE" description: "Estimated rows scanned percent" - vacuumed_pct: usage: "GAUGE" description: "Estimated vacuumed rows percent" - elapsed_time: usage: "GAUGE" description: "Elapsed time vacuuming (in seconds)" # # This query extracts marginalia comments from pg_stat_activity and provides a sampled summary of the type of # endpoints that are actively making calls # pg_stat_activity_marginalia_sampler: query: > SELECT usename AS usename, a.matches[1] AS application, a.matches[2] AS endpoint, a.matches[3] AS command, a.wait_event AS wait_event, a.state AS state, a.wait_event_type AS wait_event_type, COUNT(*) active_count, MAX(coalesce(age_in_seconds, 0)) AS max_tx_age_in_seconds FROM ( SELECT usename, regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?.*?\*\/)?\s*(\w+)') AS matches, state, wait_event, wait_event_type, EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds FROM pg_stat_activity ) a GROUP BY usename, application, endpoint, command, state, wait_event, wait_event_type ORDER BY active_count DESC metrics: - usename: usage: LABEL description: The user running the command - application: usage: LABEL description: Name of the application as presented in marginalia comment - endpoint: usage: LABEL description: Name of the web endpoint or sidekiq job as presented in marginalia comment - command: usage: LABEL description: The first word for the running command - wait_event: usage: LABEL description: Wait event of the activity as presented by pg_stat_activity.wait_event - state: usage: LABEL description: State of the activity as presented by pg_stat_activity.state - wait_event_type: usage: LABEL description: Wait event type of the activity as presented by pg_stat_activity.wait_event_type - active_count: usage: GAUGE description: Number of active queries at time of sample - max_tx_age_in_seconds: usage: GAUGE description: Number of active queries at time of sample # This records long running autovacuum processes. The reason we limit to # long-running processes is to avoid cardinality problems in prometheus. Since # we're only concerned about long running processes we limit the output only to # those that take longer than 5 minutes. pg_stat_activity_autovacuum: query: > SELECT SPLIT_PART(query, '.', 2) AS relname, EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds FROM pg_stat_activity WHERE query like 'autovacuum:%' AND EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) > 300 metrics: - relname: usage: LABEL description: The table being vacuumed - age_in_seconds: usage: GAUGE description: The age of the vacuum process in seconds # Keep track of the number of the total number of autovacuum workers # currently active pg_stat_activity_autovacuum_active: query: > SELECT v.phase, CASE when a.query ~ '^autovacuum.*to prevent wraparound' then 'wraparound' when a.query ~* '^vacuum' then 'user' when a.pid is null then null ELSE 'regular' END as mode, count(1) as workers_count FROM pg_stat_progress_vacuum v LEFT JOIN pg_stat_activity a using (pid) GROUP BY 1,2 metrics: - phase: usage: LABEL description: Vacuum phase - mode: usage: LABEL description: Vacuum mode - workers_count: usage: GAUGE description: The number of active autovacuum workers in this state # # This query extracts marginalia metadata from pg_stat_activity and provides a sampled summary about the long running transactions. # pg_long_running_transactions_marginalia: query: > SELECT activity.matches[1] AS application, activity.matches[2] AS endpoint, MAX(age_in_seconds) AS max_age_in_seconds FROM ( SELECT regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?.*?\*\/)?\s*(\w+)') AS matches, EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds FROM pg_stat_activity WHERE state <> 'idle' AND (clock_timestamp() - xact_start) > '30 seconds'::interval AND query NOT LIKE 'autovacuum:%' ) activity GROUP BY application, endpoint ORDER BY max_age_in_seconds DESC metrics: - application: usage: LABEL description: Name of the application as presented in marginalia comment - endpoint: usage: LABEL description: Name of the web endpoint or sidekiq job as presented in marginalia comment - max_age_in_seconds: usage: GAUGE description: The current maximum transaction age in seconds