data-analysis-sql

data-analysis-sql

SQL for data analysis with exploratory analysis, advanced aggregations, statistical functions, outlier detection, and business insights. 50+ real-world analytics queries.

1étoiles
0forks
Mis à jour 1/5/2026
SKILL.md
readonlyread-only
name
data-analysis-sql
description

SQL for data analysis with exploratory analysis, advanced aggregations, statistical functions, outlier detection, and business insights. 50+ real-world analytics queries.

SQL for Data Analysis

Exploratory Data Analysis (EDA)

Data Profiling

-- Understand data structure and quality
SELECT COUNT(*) as record_count FROM employees;
SELECT COUNT(DISTINCT department) as unique_departments FROM employees;
SELECT COUNT(*) - COUNT(email) as missing_emails FROM employees;

-- Column value distribution
SELECT salary, COUNT(*) as frequency
FROM employees
GROUP BY salary
ORDER BY frequency DESC;

-- Missing data analysis
SELECT
  COUNT(*) as total_records,
  COUNT(phone) as non_null_phone,
  COUNT(*) - COUNT(phone) as missing_phone,
  ROUND(100.0 * (COUNT(*) - COUNT(phone)) / COUNT(*), 2) as missing_percentage
FROM employees;

-- Data type and range checks
SELECT
  MIN(salary) as min_salary,
  MAX(salary) as max_salary,
  ROUND(AVG(salary), 2) as avg_salary,
  ROUND(STDDEV(salary), 2) as salary_stddev
FROM employees;

Distribution Analysis

-- Value frequency distribution
SELECT
  department,
  COUNT(*) as emp_count,
  ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as percentage
FROM employees
GROUP BY department
ORDER BY emp_count DESC;

-- Salary ranges and distribution
SELECT
  CASE
    WHEN salary < 50000 THEN 'Under 50K'
    WHEN salary < 75000 THEN '50K-75K'
    WHEN salary < 100000 THEN '75K-100K'
    ELSE '100K+'
  END as salary_range,
  COUNT(*) as emp_count,
  MIN(salary) as min_sal,
  MAX(salary) as max_sal,
  ROUND(AVG(salary), 2) as avg_sal
FROM employees
GROUP BY salary_range
ORDER BY MIN(salary);

-- Distribution visualization data
SELECT
  salary,
  COUNT(*) as frequency,
  ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as pct,
  RPAD('*', COUNT(*) / 10, '*') as bar_chart
FROM employees
GROUP BY salary
ORDER BY salary;

Statistical Analysis

Summary Statistics

-- Comprehensive statistics by group
SELECT
  department,
  COUNT(*) as count,
  ROUND(AVG(salary), 2) as mean_salary,
  ROUND(MIN(salary), 2) as min_salary,
  ROUND(MAX(salary), 2) as max_salary,
  ROUND(STDDEV(salary), 2) as stddev_salary,
  ROUND(AVG(ABS(salary - (SELECT AVG(salary) FROM employees WHERE department = e.department))), 2) as avg_deviation
FROM employees e
GROUP BY department
ORDER BY mean_salary DESC;

-- Percentile analysis
SELECT
  department,
  ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary), 2) as q1,
  ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY salary), 2) as median,
  ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary), 2) as q3,
  ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary), 2) as p95
FROM employees
GROUP BY department;

Outlier Detection

-- Find outliers using standard deviation
SELECT
  emp_id,
  first_name,
  salary,
  ROUND(AVG(salary) OVER (), 2) as avg_salary,
  ROUND(STDDEV(salary) OVER (), 2) as stddev_salary,
  ROUND(ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0), 2) as z_score
FROM employees
HAVING ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0) > 3
ORDER BY z_score DESC;

-- IQR method for outliers
WITH salary_stats AS (
  SELECT
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) as q1,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) as q3
  FROM employees
)
SELECT
  emp_id,
  salary,
  CASE
    WHEN salary < (SELECT q1 FROM salary_stats) - 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats))
    OR salary > (SELECT q3 FROM salary_stats) + 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats))
    THEN 'Outlier'
    ELSE 'Normal'
  END as outlier_status
FROM employees;

Comparative Analysis

Period-over-Period Comparison

-- Year-over-year sales comparison
SELECT
  EXTRACT(QUARTER FROM order_date) as quarter,
  EXTRACT(YEAR FROM order_date) as year,
  ROUND(SUM(amount), 2) as total_sales,
  ROUND(LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as prev_period,
  ROUND(SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_change,
  ROUND(100.0 * (SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date))) / LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_pct_change
FROM orders
GROUP BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)
ORDER BY year, quarter;

Cohort Analysis

-- User cohort analysis
WITH user_cohorts AS (
  SELECT
    DATE_TRUNC('month', first_order_date)::DATE as cohort_month,
    user_id,
    DATE_TRUNC('month', order_date)::DATE as order_month
  FROM users u
  LEFT JOIN orders o ON u.id = o.user_id
)
SELECT
  cohort_month,
  DATE_PART('month', order_month - cohort_month) / 1 as months_since_cohort,
  COUNT(DISTINCT user_id) as users,
  ROUND(100.0 * COUNT(DISTINCT user_id) /
    (SELECT COUNT(DISTINCT user_id) FROM user_cohorts WHERE order_month = cohort_month), 2) as retention_rate
FROM user_cohorts
WHERE order_month >= cohort_month
GROUP BY cohort_month, months_since_cohort
ORDER BY cohort_month, months_since_cohort;

Correlation & Relationship Analysis

-- Correlation between variables
WITH salary_data AS (
  SELECT
    years_experience,
    salary,
    AVG(salary) OVER () as avg_salary,
    AVG(years_experience) OVER () as avg_experience,
    STDDEV(salary) OVER () as stddev_salary,
    STDDEV(years_experience) OVER () as stddev_experience
  FROM employees
)
SELECT
  ROUND(
    SUM((years_experience - avg_experience) * (salary - avg_salary)) /
    (COUNT(*) * stddev_salary * stddev_experience),
    4
  ) as correlation
FROM salary_data;

-- Segment analysis
SELECT
  CASE
    WHEN years_experience < 2 THEN 'Junior'
    WHEN years_experience < 5 THEN 'Mid-level'
    WHEN years_experience < 10 THEN 'Senior'
    ELSE 'Expert'
  END as experience_level,
  COUNT(*) as count,
  ROUND(AVG(salary), 2) as avg_salary,
  ROUND(AVG(performance_rating), 2) as avg_rating
FROM employees
GROUP BY experience_level
ORDER BY COUNT(*) DESC;

Data Quality Validation

-- Check for invalid values
SELECT
  CASE
    WHEN salary < 0 THEN 'Negative salary'
    WHEN salary > 1000000 THEN 'Unusually high salary'
    WHEN email NOT LIKE '%@%' THEN 'Invalid email'
    WHEN hire_date > CURRENT_DATE THEN 'Future hire date'
    WHEN years_experience > 70 THEN 'Impossible experience'
    ELSE NULL
  END as data_quality_issue,
  COUNT(*) as count
FROM employees
WHERE salary < 0
  OR salary > 1000000
  OR email NOT LIKE '%@%'
  OR hire_date > CURRENT_DATE
  OR years_experience > 70
GROUP BY data_quality_issue;

-- Duplicate detection
SELECT
  email,
  COUNT(*) as occurrence_count,
  STRING_AGG(DISTINCT emp_id::text, ', ') as emp_ids
FROM employees
WHERE email IS NOT NULL
GROUP BY email
HAVING COUNT(*) > 1
ORDER BY occurrence_count DESC;

Trend Analysis

-- Moving average
SELECT
  order_date,
  amount,
  ROUND(AVG(amount) OVER (
    ORDER BY order_date
    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
  ), 2) as moving_avg_7day,
  ROUND(AVG(amount) OVER (
    ORDER BY order_date
    ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
  ), 2) as moving_avg_30day
FROM daily_orders
ORDER BY order_date;

-- Growth rate
SELECT
  DATE_TRUNC('month', order_date)::DATE as month,
  ROUND(SUM(amount), 2) as monthly_revenue,
  ROUND((SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date))) /
    LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date)) * 100, 2) as growth_rate_pct
FROM orders
GROUP BY DATE_TRUNC('month', order_date)
ORDER BY month;

Next Steps

Learn advanced SQL concepts and optimization techniques in the advanced-sql skill.

You Might Also Like

Related Skills

zig-system-calls

zig-system-calls

87Kdev-database

Guides using bun.sys for system calls and file I/O in Zig. Use when implementing file operations instead of std.fs or std.posix.

oven-sh avataroven-sh
Obtenir
bun-file-io

bun-file-io

86Kdev-database

Use this when you are working on file operations like reading, writing, scanning, or deleting files. It summarizes the preferred file APIs and patterns used in this repo. It also notes when to use filesystem helpers for directories.

anomalyco avataranomalyco
Obtenir
vector-index-tuning

vector-index-tuning

26Kdev-database

Optimize vector index performance for latency, recall, and memory. Use when tuning HNSW parameters, selecting quantization strategies, or scaling vector search infrastructure.

wshobson avatarwshobson
Obtenir

Implement efficient similarity search with vector databases. Use when building semantic search, implementing nearest neighbor queries, or optimizing retrieval performance.

wshobson avatarwshobson
Obtenir

Master dbt (data build tool) for analytics engineering with model organization, testing, documentation, and incremental strategies. Use when building data transformations, creating data models, or implementing analytics engineering best practices.

wshobson avatarwshobson
Obtenir
event-store-design

event-store-design

26Kdev-database

Design and implement event stores for event-sourced systems. Use when building event sourcing infrastructure, choosing event store technologies, or implementing event persistence patterns.

wshobson avatarwshobson
Obtenir