SQL: my workflow

SQL

My workflow on exploring new datasets

Author

Chi Zhang

Published

January 7, 2025

Summary on one table

get to know what variables (columns) are available in the table -> print first n rows (potential 1)
number of rows with COUNT(*), combined with conditions

-- prints first row
SELECT * FROM data LIMIT 1 

-- prints nrow
SELECT COUNT(*) FROM data 

-- number of the first names that start with "C"
SELECT COUNT(*) FROM data WHERE first_name LIKE "C%"

Subsetting with IN syntax. It also works on texts.

SELECT *
FROM patients 
WHERE patient_id IN (1, 45, 534, 879, 1000)

grouping by ID, then count records per ID (no need to compile the inner data first)

SELECT
  patient_id,
  COUNT(*) AS total_admissions
FROM admissions
WHERE patient_id = 579;

Data quality

check missing (NULL)

Numerial column

range of values, with max and min
combine with values derived from inside the table, say the maximum of a column

SELECT
  first_name,
  last_name,
  height
FROM patients
WHERE height = (
    SELECT max(height)
    FROM patients
  )

Date

select year that starts from a certain value, like 2020

SELECT count(*) 
FROM patients 
WHERE birth_date like "2010%"

-- the following should also work
-- WHERE YEAR(birth_date) = '2010'
-- WHERE EXTRACT(YEAR from birth_date) = '2010'
-- WHERE substring(birth_date, 4) = '2010'

Text

unique categories
text length with LEN()
get the number of unique values
- select count directly
- WITH clause

SELECT DISTINCT xvar FROM data

-- solution 1
SELECT COUNT(allergies) FROM (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)

-- solution 2 (with clause)
WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL) 

SELECT COUNT(allergies) FROM alls

-- get the id and firstname where it starts and end with s, with length greater or equal to 6
SELECT patient_id, first_name 
FROM patients
WHERE first_name LIKE "S%s" 
AND LEN(first_name)>=6

Multiple tables: JOIN

SELECT first_name, last_name, province_name 
FROM patients LEFT JOIN province_names
ON patients.province_id = province_names.province_id

Subquery

`WITH` and `HAVING`

WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL) 

SELECT COUNT(allergies) FROM alls

WITH tmp AS (
  select patient_id, diagnosis, count(*) as freq
    from admissions
group by patient_id, diagnosis)

select patient_id, diagnosis from tmp where freq >1

-- equivalent to 

SELECT
  patient_id,
  diagnosis
FROM admissions
GROUP BY
  patient_id,
  diagnosis
HAVING COUNT(*) > 1

Column-wise presentation

SELECT does not need to always end with FROM at the very end, it can combine results from other selects.

select 
(select count(*) FROM patients where gender = 'M') as male_count, 
(select count(*) FROM patients where gender = 'F') as female_count