SQL: my workflow
SQL
My workflow on exploring new datasets
Summary on one table
- get to know what variables (columns) are available in the table -> print first n rows (potential 1)
- number of rows with
COUNT(*)
, combined with conditions
-- prints first row
SELECT * FROM data LIMIT 1
-- prints nrow
SELECT COUNT(*) FROM data
-- number of the first names that start with "C"
SELECT COUNT(*) FROM data WHERE first_name LIKE "C%"
Subsetting with IN
syntax. It also works on texts.
SELECT *
FROM patients
WHERE patient_id IN (1, 45, 534, 879, 1000)
grouping by ID, then count records per ID (no need to compile the inner data first)
SELECT
patient_id,COUNT(*) AS total_admissions
FROM admissions
WHERE patient_id = 579;
Data quality
check missing (NULL)
Numerial column
- range of values, with
max
andmin
- combine with values derived from inside the table, say the maximum of a column
SELECT
first_name,
last_name,
heightFROM patients
WHERE height = (
SELECT max(height)
FROM patients
)
Date
select year that starts from a certain value, like 2020
SELECT count(*)
FROM patients
WHERE birth_date like "2010%"
-- the following should also work
-- WHERE YEAR(birth_date) = '2010'
-- WHERE EXTRACT(YEAR from birth_date) = '2010'
-- WHERE substring(birth_date, 4) = '2010'
Text
- unique categories
- text length with
LEN()
- get the number of unique values
- select count directly
WITH
clause
SELECT DISTINCT xvar FROM data
-- solution 1
SELECT COUNT(allergies) FROM (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
-- solution 2 (with clause)
WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
SELECT COUNT(allergies) FROM alls
-- get the id and firstname where it starts and end with s, with length greater or equal to 6
SELECT patient_id, first_name
FROM patients
WHERE first_name LIKE "S%s"
AND LEN(first_name)>=6
Multiple tables: JOIN
SELECT first_name, last_name, province_name
FROM patients LEFT JOIN province_names
ON patients.province_id = province_names.province_id
Subquery
WITH
and HAVING
WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
SELECT COUNT(allergies) FROM alls
WITH tmp AS (
select patient_id, diagnosis, count(*) as freq
from admissions
group by patient_id, diagnosis)
select patient_id, diagnosis from tmp where freq >1
-- equivalent to
SELECT
patient_id,
diagnosisFROM admissions
GROUP BY
patient_id,
diagnosisHAVING COUNT(*) > 1
Column-wise presentation
SELECT
does not need to always end with FROM
at the very end, it can combine results from other selects.
select
select count(*) FROM patients where gender = 'M') as male_count,
(select count(*) FROM patients where gender = 'F') as female_count (