I'm working on modernizing Rosetta Code's infrastructure. Starting with communications. Please accept this time-limited open invite to RC's Slack.. --Michael Mol (talk) 20:59, 30 May 2020 (UTC)

Merge and aggregate datasets

From Rosetta Code
Task
Merge and aggregate datasets
You are encouraged to solve this task according to the task description, using any language you may know.
Merge and aggregate datasets
Task

Merge and aggregate two datasets as provided in .csv files into a new resuling dataset. Use the appropriate methods and data structures depending on the programming language. Use the most common libraries only when built-in functionality is not sufficient.
Note: Either load the data from the .csv files or create the required data structures hard-coded.

patients.csv file contents:

PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz

visits.csv file contents:

PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3

Create a resulting dataset in-memory or output it to screen or file, whichever is appropriate for the programming language at hand. Merge and group per patient id and last name, get the maximum visit date and get the sum and average of the scores per patient to get the resulting datasset. Note that the visit date is purposefully provided as ISO format, so that it could also be processed as text and sorted alphabetically to determine the maximum date.

| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 6.80      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |

Note

This task is aimed in particular at programming languages that are used in data science and data processing, such as F#, Python, R, SPSS, MATLAB etc.

Related tasks


C++[edit]

Uses C++20

#include <iostream>
#include <optional>
#include <ranges>
#include <string>
#include <vector>
 
using namespace std;
 
struct Patient
{
string ID;
string LastName;
};
 
struct Visit
{
string PatientID;
string Date;
optional<float> Score;
};
 
int main(void)
{
auto patients = vector<Patient> {
{"1001", "Hopper"},
{"4004", "Wirth"},
{"3003", "Kemeny"},
{"2002", "Gosling"},
{"5005", "Kurtz"}};
 
auto visits = vector<Visit> {
{"2002", "2020-09-10", 6.8},
{"1001", "2020-09-17", 5.5},
{"4004", "2020-09-24", 8.4},
{"2002", "2020-10-08", },
{"1001", "" , 6.6},
{"3003", "2020-11-12", },
{"4004", "2020-11-05", 7.0},
{"1001", "2020-11-19", 5.3}};
 
// sort the patients by ID
sort(patients.begin(), patients.end(),
[](const auto& a, const auto&b){ return a.ID < b.ID;});
 
cout << "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n";
for(const auto& patient : patients)
{
// loop over all of the patients and determine the fields
string lastVisit;
float sum = 0;
int numScores = 0;
 
// use C++20 ranges to filter the visits by patients
auto patientFilter = [&patient](const Visit &v){return v.PatientID == patient.ID;};
for(const auto& visit : visits | views::filter( patientFilter ))
{
if(visit.Score)
{
sum += *visit.Score;
numScores++;
}
lastVisit = max(lastVisit, visit.Date);
}
 
// format the output
cout << "| " << patient.ID << " | ";
cout.width(8); cout << patient.LastName << " | ";
cout.width(10); cout << lastVisit << " | ";
if(numScores > 0)
{
cout.width(9); cout << sum << " | ";
cout.width(9); cout << (sum / float(numScores));
}
else cout << " | ";
cout << " |\n";
}
}
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
|       1001 |   Hopper | 2020-11-19 |      17.4 |       5.8 |
|       2002 |  Gosling | 2020-10-08 |       6.8 |       6.8 |
|       3003 |   Kemeny | 2020-11-12 |           |           |
|       4004 |    Wirth | 2020-11-05 |      15.4 |       7.7 |
|       5005 |    Kurtz |            |           |           |

F#[edit]

Note that the scores are right justified to copy the task description. It would be more natural to leave them right justified.

 
// Merge and aggregate datasets. Nigel Galloway: January 6th., 2021
let rFile(fName)=seq{use n=System.IO.File.OpenText(fName)
n.ReadLine() |> ignore
while not n.EndOfStream do yield n.ReadLine().Split [|','|]}
let N=rFile("file1.txt") |> Seq.sort
let G=rFile("file2.txt") |> Seq.groupBy(fun n->n.[0]) |> Map.ofSeq
let fN n i g e l=printfn "| %-10s | %-8s | %10s |  %-9s | %-9s |" n i g e l
let fG n g=let z=G.[n]|>Seq.sumBy(fun n->try float n.[2] with :? System.FormatException->0.0)
fN n g (G.[n]|>Seq.sort|>Seq.last).[1] (if z=0.0 then "" else string z) (if z=0.0 then "" else string(z/(float(Seq.length G.[n]))))
 
Output:
printfn "| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |"
N|>Seq.iter(fun n->match G.ContainsKey n.[0] with true->fG n.[0] n.[1] |_->fN n.[0] n.[1] "" "" "")

| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 |  17.4      | 5.8       |
| 2002       | Gosling  | 2020-10-08 |  6.8       | 3.4       |
| 3003       | Kemeny   | 2020-11-12 |            |           |
| 4004       | Wirth    | 2020-11-05 |  15.4      | 7.7       |
| 5005       | Kurtz    |            |            |           |

Go[edit]

Translation of: Wren
package main
 
import (
"fmt"
"math"
"sort"
)
 
type Patient struct {
id int
lastName string
}
 
// maps an id to a lastname
var patientDir = make(map[int]string)
 
// maintains a sorted list of ids
var patientIds []int
 
func patientNew(id int, lastName string) Patient {
patientDir[id] = lastName
patientIds = append(patientIds, id)
sort.Ints(patientIds)
return Patient{id, lastName}
}
 
type DS struct {
dates []string
scores []float64
}
 
type Visit struct {
id int
date string
score float64
}
 
// maps an id to lists of dates and scores
var visitDir = make(map[int]DS)
 
func visitNew(id int, date string, score float64) Visit {
if date == "" {
date = "0000-00-00"
}
v, ok := visitDir[id]
if ok {
v.dates = append(v.dates, date)
v.scores = append(v.scores, score)
visitDir[id] = DS{v.dates, v.scores}
} else {
visitDir[id] = DS{[]string{date}, []float64{score}}
}
return Visit{id, date, score}
}
 
type Merge struct{ id int }
 
func (m Merge) lastName() string { return patientDir[m.id] }
func (m Merge) dates() []string { return visitDir[m.id].dates }
func (m Merge) scores() []float64 { return visitDir[m.id].scores }
 
func (m Merge) lastVisit() string {
dates := m.dates()
dates2 := make([]string, len(dates))
copy(dates2, dates)
sort.Strings(dates2)
return dates2[len(dates2)-1]
}
 
func (m Merge) scoreSum() float64 {
sum := 0.0
for _, score := range m.scores() {
if score != -1 {
sum += score
}
}
return sum
}
 
func (m Merge) scoreAvg() float64 {
count := 0
for _, score := range m.scores() {
if score != -1 {
count++
}
}
return m.scoreSum() / float64(count)
}
 
func mergePrint(merges []Merge) {
fmt.Println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
f := "| %d | %-7s | %s | %4s | %4s |\n"
for _, m := range merges {
_, ok := visitDir[m.id]
if ok {
lv := m.lastVisit()
if lv == "0000-00-00" {
lv = " "
}
scoreSum := m.scoreSum()
ss := fmt.Sprintf("%4.1f", scoreSum)
if scoreSum == 0 {
ss = " "
}
scoreAvg := m.scoreAvg()
sa := " "
if !math.IsNaN(scoreAvg) {
sa = fmt.Sprintf("%4.2f", scoreAvg)
}
fmt.Printf(f, m.id, m.lastName(), lv, ss, sa)
} else {
fmt.Printf(f, m.id, m.lastName(), " ", " ", " ")
}
}
}
 
func main() {
patientNew(1001, "Hopper")
patientNew(4004, "Wirth")
patientNew(3003, "Kemeny")
patientNew(2002, "Gosling")
patientNew(5005, "Kurtz")
 
visitNew(2002, "2020-09-10", 6.8)
visitNew(1001, "2020-09-17", 5.5)
visitNew(4004, "2020-09-24", 8.4)
visitNew(2002, "2020-10-08", -1) // -1 signifies no score
visitNew(1001, "", 6.6) // "" signifies no date
visitNew(3003, "2020-11-12", -1)
visitNew(4004, "2020-11-05", 7.0)
visitNew(1001, "2020-11-19", 5.3)
 
merges := make([]Merge, len(patientIds))
for i, id := range patientIds {
merges[i] = Merge{id}
}
mergePrint(merges)
}
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |

Julia[edit]

Translation of: Python
using CSV, DataFrames, Statistics
 
# load data from csv files
#df_patients = CSV.read("patients.csv", DataFrame)
#df_visits = CSV.read("visits.csv", DataFrame)
 
# create DataFrames from text that is hard coded, so use IOBuffer(String) as input
str_patients = IOBuffer("""PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
""")
df_patients = CSV.read(str_patients, DataFrame)
str_visits = IOBuffer("""PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
""")
df_visits = CSV.read(str_visits, DataFrame)
 
# merge on PATIENT_ID, using an :outer join or we lose Kurtz, who has no data, sort by ID
df_merge = sort(join(df_patients, df_visits, on="PATIENT_ID", kind=:outer), (:PATIENT_ID,))
 
fnonmissing(a, f) = isempty(a) ? [] : isempty(skipmissing(a)) ? a[1] : f(skipmissing(a))
 
# group by patient id / last name and then aggregate to get latest visit and mean score
df_result = by(df_merge, [:PATIENT_ID, :LASTNAME]) do df
DataFrame(LATEST_VISIT = fnonmissing(df[:VISIT_DATE], maximum),
SUM_SCORE = fnonmissing(df[:SCORE], sum),
MEAN_SCORE = fnonmissing(df[:SCORE], mean))
end
println(df_result)
 
Output:
5×5 DataFrame
│ Row │ PATIENT_ID │ LASTNAME │ LATEST_VISIT │ SUM_SCORE │ MEAN_SCORE │
│     │ Int64?     │ String?  │ Dates.Date?  │ Float64?  │ Float64?   │
├─────┼────────────┼──────────┼──────────────┼───────────┼────────────┤
│ 1   │ 1001       │ Hopper   │ 2020-11-19   │ 17.4      │ 5.8        │
│ 2   │ 2002       │ Gosling  │ 2020-10-08   │ 6.8       │ 6.8        │
│ 3   │ 3003       │ Kemeny   │ 2020-11-12   │ missing   │ missing    │
│ 4   │ 4004       │ Wirth    │ 2020-11-05   │ 15.4      │ 7.7        │
│ 5   │ 5005       │ Kurtz    │ missing      │ missing   │ missing    │

Phix[edit]

constant patients_txt = split("""
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz""",'\n'),
visits_txt = split("""
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3""",'\n')
--or, assuming these files contain exactly the same actual raw text,
-- and the use of GT_LF_STRIPPED is instead of above split('\n'):
--constant patients_txt = get_text("patients.csv",GT_LF_STRIPPED),
-- visits_txt = get_text("visits.csv",GT_LF_STRIPPED)
 
function sap(string s) return split(s,',')&{"",0,0} end function
function svp(string s) return split(s,',') end function
sequence patient_data = sort(apply(patients_txt[2..$],sap)),
visit_data = sort_columns(apply(visits_txt[2..$],svp),{1,-2})
visit_data = append(visit_data,{"","","0"}) -- (add a sentinel)
string last_id = "",id,name,dt,scstr,avstr
atom score,score_total,average
integer visit_count = 0, pdx = 1
for i=1 to length(visit_data) do
{id,dt,scstr} = visit_data[i]
score = iff(scstr=""?0:scanf(scstr,"%f")[1][1])
if id!=last_id then
if visit_count then
average = score_total/visit_count
patient_data[pdx][4..5] = {score_total,average}
end if
if i=length(visit_data) then exit end if -- (sentinel)
score_total = score
visit_count = (score!=0)
while id!=patient_data[pdx][1] do pdx += 1 end while
patient_data[pdx][3] = dt
last_id = id
elsif score!=0 then
score_total += score
visit_count += 1
end if
end for
printf(1,"| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n")
for i=1 to length(patient_data) do
{id,name,dt,score,average} = patient_data[i]
scstr = iff(score=0?"":sprintf("%4.1f",score))
avstr = iff(average=0?"":sprintf("%4.2f",average))
printf(1,"| %-10s |  %-7s | %10s |  %-9s | %-9s |\n",
{id,name,dt,scstr,avstr})
end for
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 6.80      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |

Perl[edit]

Weird requirement: "Use the most common libraries only when built-in functionality is not sufficient." Not even a "use strict;" :)

#!/usr/bin/perl
my $fmt = '| %-11s' x 5 . "|\n";
printf $fmt, qw( PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG);
my ($names, $visits) = do { local $/; split /^\n/m, <DATA> };
my %score;
for ( $visits =~ /^\d.*/gm )
{
my ($id, undef, $score) = split /,/;
$score{$id} //= ['', ''];
$score and $score{$id}[0]++, $score{$id}[1] += $score;
}
for ( sort $names =~ /^\d.*/gm )
{
my ($id, $name) = split /,/;
printf $fmt, $id, $name, ( sort $visits =~ /^$id,(.*?),/gm, '' )[-1],
$score{$id}[0]
? ( $score{$id}[1], $score{$id}[1] / $score{$id}[0])
: ('', '');
}
 
__DATA__
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
 
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
Output:
| PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
| 1001       | Hopper     | 2020-11-19 | 17.4       | 5.8        |
| 2002       | Gosling    | 2020-10-08 | 6.8        | 6.8        |
| 3003       | Kemeny     | 2020-11-12 |            |            |
| 4004       | Wirth      | 2020-11-05 | 15.4       | 7.7        |
| 5005       | Kurtz      |            |            |            |

Python[edit]

Python: Using pandas library[edit]

# to install pandas library go to cmd prompt and type:
# cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\
# pip install pandas
import pandas as pd
 
# load data from csv files
df_patients = pd.read_csv (r'patients.csv', sep = ",", decimal=".")
df_visits = pd.read_csv (r'visits.csv', sep = ",", decimal=".")
 
''' # load data hard coded, create data frames
import io
str_patients = """PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
"""
df_patients = pd.read_csv(io.StringIO(str_patients), sep = ",", decimal=".")
str_visits = """PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
"""
df_visits = pd.read_csv(io.StringIO(str_visits), sep = ",", decimal=".")
'''

 
# typecast from string to datetime so .agg can 'max' it
df_visits['VISIT_DATE'] = pd.to_datetime(df_visits['VISIT_DATE'])
 
# merge on PATIENT_ID
df_merge = df_patients.merge(df_visits, on='PATIENT_ID', how='left')
 
# groupby is an intermediate object
df_group = df_merge.groupby(['PATIENT_ID','LASTNAME'], as_index=False)
 
# note: you can use 'sum' instead of the lambda function but that returns NaN as 0 (zero)
df_result = df_group.agg({'VISIT_DATE': 'max', 'SCORE': [lambda x: x.sum(min_count=1),'mean']})
 
print(df_result)
  PATIENT_ID LASTNAME LAST_VISIT      SCORE
                             max <lambda_0> mean
0       1001   Hopper 2020-11-19       17.4  5.8
1       2002  Gosling 2020-10-08        6.8  6.8
2       3003   Kemeny 2020-11-12        NaN  NaN
3       4004    Wirth 2020-11-05       15.4  7.7
4       5005    Kurtz        NaT        NaN  NaN

Python: Stdlib csv only[edit]

Using only standard libraries and input from csv files.

import csv
 
fnames = 'patients.csv patients_visits.csv'.split()
 
def csv2list(fname):
with open(fname) as f:
rows = list(csv.reader(f))
return rows
 
patients, visits = data = [csv2list(fname) for fname in fnames]
result = [record.copy() for record in patients]
result[1:] = sorted(result[1:])
#%%
result[0].append('LAST_VISIT')
last = {p: vis for p, vis, *score in visits[1:]}
for record in result[1:]:
p = record[0]
record.append(last.get(p, ''))
#%%
result[0] += ['SCORE_SUM', 'SCORE_AVG']
n = {p: 0 for p, *_ in patients[1:]}
tot = n.copy()
for record in visits[1:]:
p, _, score = record
if score:
n[p] += 1
tot[p] += float(score)
for record in result[1:]:
p = record[0]
if n[p]:
record += [f"{tot[p]:5.1f}", f"{tot[p] / n[p]:5.2f}"]
else:
record += ['', '']
#%%
for record in result:
print(f"| {' | '.join(f'{r:^10}' for r in record)} |")
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.80    |
|    2002    |  Gosling   | 2020-10-08 |     6.8    |    6.80    |
|    3003    |   Kemeny   | 2020-11-12 |            |            |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.70    |
|    5005    |   Kurtz    |            |            |            |


Python: Stdlib sqlite3 and csv only[edit]

Using the csv module just to parse inputs; and the sqlite3 module, (which which is also a standard library that comes with the base Python install), to calculate the output.
(The SQL SELECT statement is modelled on that of the SQL entry elsewhere on this page).

import sqlite3
import csv
 
 
fnames = 'patients.csv patients_visits.csv'.split()
conn = sqlite3.connect(":memory:")
#%%
 
def create_table_headers(conn):
curs = conn.cursor()
curs.execute('''
CREATE TABLE patients(PATIENT_ID INT, LASTNAME TEXT);
'''
)
curs.execute('''
CREATE TABLE patients_visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));
'''
)
conn.commit()
 
def fill_tables(conn, fnames):
curs = conn.cursor()
for fname in fnames:
with open(fname) as f:
tablename = fname.replace('.csv', '')
#
csvdata = csv.reader(f)
header = next(csvdata)
fields = ','.join('?' for _ in header)
for row in csvdata:
row = [(None if r == '' else r) for r in row]
curs.execute(f"INSERT INTO {tablename} VALUES ({fields});", row)
conn.commit()
 
def join_tables_and_group(conn):
curs = conn.cursor()
curs.execute('''
CREATE TABLE answer AS
SELECT
patients.PATIENT_ID,
patients.LASTNAME,
MAX(VISIT_DATE) AS LAST_VISIT,
SUM(SCORE) AS SCORE_SUM,
CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG
FROM
patients
LEFT JOIN patients_visits
ON patients_visits.PATIENT_ID = patients.PATIENT_ID
GROUP BY
patients.PATIENT_ID,
patients.LASTNAME
ORDER BY
patients.PATIENT_ID;
'''
)
curs.execute('''
SELECT * FROM answer;
'''
)
conn.commit()
rows = list(curs.fetchall())
headers = tuple(d[0] for d in curs.description)
return [headers] + rows
 
create_table_headers(conn)
fill_tables(conn, fnames)
result = join_tables_and_group(conn)
for record in result:
print(f"| {' | '.join(f'{str(r):^10}' for r in record)} |")
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.8     |
|    2002    |  Gosling   | 2020-10-08 |    6.8     |    6.8     |
|    3003    |   Kemeny   | 2020-11-12 |    None    |    None    |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.7     |
|    5005    |   Kurtz    |    None    |    None    |    None    |

R[edit]

# load data from csv files
# setwd("C:\Temp\csv\")
# df_patient <- read.csv(file="patients.csv", header = TRUE, sep = ",")
# df_visits <- read.csv(file="visits.csv", header = TRUE, sep = ",", dec = ".", colClasses=c("character","character","numeric"))
 
# load data hard coded, create data frames
df_patient <- read.table(text = "
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
", header = TRUE, sep = ",") # character fields so no need for extra parameters colClasses etc.
 
df_visits <- read.table(text = "
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
", header = TRUE, dec = ".", sep = ",", colClasses=c("character","character","numeric"))
 
# aggregate visit date and scores
df_agg <- data.frame(
cbind(
PATIENT_ID = names(tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE)),
last_visit = tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE),
score_sum = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), sum, na.rm=TRUE),
score_avg = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), mean, na.rm=TRUE)
)
)
 
# merge patients and aggregate dataset
# all.x = all the non matching cases of df_patient are appended to the result as well (i.e. 'left join')
df_result <- merge(df_patient, df_agg, by = 'PATIENT_ID', all.x = TRUE)
 
print(df_result)
Output:
  PATIENT_ID LASTNAME last_visit score_sum score_avg
1       1001   Hopper 2020-11-19      17.4       5.8
2       2002  Gosling 2020-10-08       6.8       6.8
3       3003   Kemeny 2020-11-12         0       NaN
4       4004    Wirth 2020-11-05      15.4       7.7
5       5005    Kurtz       <NA>      <NA>      <NA>

Raku[edit]

my @names = map { ( <PATIENT_ID LASTNAME> Z=> .list ).hash },
( 1001, 'Hopper' ),
( 4004, 'Wirth' ),
( 3003, 'Kemeny' ),
( 2002, 'Gosling' ),
( 5005, 'Kurtz' ),
;
my @visits = map { ( <PATIENT_ID VISIT_DATE SCORE> Z=> .list ).hash },
( 2002, '2020-09-10', 6.8 ),
( 1001, '2020-09-17', 5.5 ),
( 4004, '2020-09-24', 8.4 ),
( 2002, '2020-10-08', Nil ),
( 1001, Nil , 6.6 ),
( 3003, '2020-11-12', Nil ),
( 4004, '2020-11-05', 7.0 ),
( 1001, '2020-11-19', 5.3 ),
;
 
my %v = @visits.classify: *.<PATIENT_ID>;
 
my @result = gather for @names -> %n {
my @p = %v{ %n.<PATIENT_ID> }<>;
 
my @dates = @p».<VISIT_DATE>.grep: *.defined;
my @scores = @p».< SCORE>.grep: *.defined;
 
take {
%n,
LAST_VISIT => ( @dates.max if @dates ),
SCORE_AVG => ( @scores.sum/@scores if @scores ),
SCORE_SUM => ( @scores.sum if @scores ),
};
}
 
my @out_field_names = <PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG>;
my @rows = @result.sort(*.<PATIENT_ID>).map(*.{@out_field_names});
say .map({$_ // ''}).fmt('%-10s', ' | ') for @out_field_names, |@rows;
Output:
PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG 
1001       | Hopper     | 2020-11-19 | 17.4       | 5.8       
2002       | Gosling    | 2020-10-08 | 6.8        | 6.8       
3003       | Kemeny     | 2020-11-12 |            |           
4004       | Wirth      | 2020-11-05 | 15.4       | 7.7       
5005       | Kurtz      |            |            |           

REXX[edit]

/* REXX */
patients='patients.csv'
l=linein(patients)
Parse Var l h1 ',' h2
n=0
idl=''
Do n=1 By 1 While lines(patients)>0
l=linein(patients)
Parse Var l id ',' lastname.id
idl=idl id
End
n=n-1 /* number of patients */
visits='visits.csv'
l=linein(visits) /* skip the header line of this file */
h3='LAST_VISIT'
h4='SCORE_SUM'
h5='SCORE_AVG'
date.=''
score.=0
Say '|' h1 '|' h2 '|' h3 '|' h4 '|' h5 '|'
Do While lines(visits)>0
l=linein(visits)
Parse Var l id ',' date ',' score
if date>date.id Then date.id=date
If score>'' Then Do
z=score.id.0+1
score.id.z=score
score.id.0=z
End
end
idl=wordsort(idl)
Do While idl<>''
Parse Var idl id idl
If date.id='' Then date.id=copies(' ',10)
ol='|' left(id,length(h1)) '|' left(lastname.id,length(h2)),
'|' left(date.id,length(h3))
If score.id.0=0 Then Do
ol=ol '|' left(' ',length(h4)) '|',
left(' ',length(h5)) '|'
score_sum=copies(' ',length(h4))
score_avg=copies(' ',length(h4))
End
Else Do
score_sum=0
Do j=1 To score.id.0
score_sum=score_sum+score.id.j
End
score_avg=score_sum/score.id.0
ol=ol '|' left(format(score_sum,2,1),length(h4)) '|',
left(format(score_avg,2,1),length(h5)) '|'
End
Say ol
End
Exit
 
wordsort: Procedure
/**********************************************************************
* Sort the list of words supplied as argument. Return the sorted list
**********************************************************************/

Parse Arg wl
wa.=''
wa.0=0
Do While wl<>''
Parse Var wl w wl
Do i=1 To wa.0
If wa.i>w Then Leave
End
If i<=wa.0 Then Do
Do j=wa.0 To i By -1
ii=j+1
wa.ii=wa.j
End
End
wa.i=w
wa.0=wa.0+1
End
swl=''
Do i=1 To wa.0
swl=swl wa.i
End
Return strip(swl)
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      |  5.8      |
| 2002       | Gosling  | 2020-10-08 |  6.8      |  6.8      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      |  7.7      |
| 5005       | Kurtz    |            |           |           |

SPSS[edit]

Note: the PATIENT_ID 5005 is missing from the result, because MATCH FILES cannot add rows for missing keys, i.e. AFAIK a "left join" is not possible is SPSS.

* set working directory to location of .csv files
 
CD 'C:\Temp\csv\'.
 
* load patients csv data
 
GET DATA /TYPE=TXT
/FILE="patients.csv"
/ENCODING='UTF8'
/DELCASE=LINE
/DELIMITERS=","
/QUALIFIER='"'
/ARRANGEMENT=DELIMITED
/FIRSTCASE=2
/IMPORTCASE=ALL
/VARIABLES=
PATIENT_ID F5.0
LASTNAME A20
.
CACHE.
EXECUTE.
 
* sort cases is needed to match files
 
SORT CASES BY PATIENT_ID (A).
DATASET NAME Patients WINDOW=FRONT.
 
* load visits csv data
 
GET DATA /TYPE=TXT
/FILE="visit.csv"
/ENCODING='UTF8'
/DELCASE=LINE
/DELIMITERS=","
/QUALIFIER='"'
/ARRANGEMENT=DELIMITED
/FIRSTCASE=2
/IMPORTCASE=ALL
/VARIABLES=
PATIENT_ID F5.0
VISIT_DATE SDATE10
SCORE F4.1
.
CACHE.
EXECUTE.
 
* sort cases is needed, else match files will raise error "Files out of order"
 
SORT CASES BY PATIENT_ID (A) VISIT_DATE (A).
DATASET NAME Visits WINDOW=FRONT.
 
* load visits csv data
 
* merge datasets, one to many, FILE is the 'one', TABLE is 'many'
 
MATCH FILES TABLE = Patients / FILE = Visits
/BY PATIENT_ID.
EXECUTE.
 
* aggregate visit date and scores, group by and order (A)=ascending or (D)=descending
 
AGGREGATE OUTFILE *
/BREAK=PATIENT_ID(A)
/last_visit = MAX(VISIT_DATE)
/score_avg = MEAN(SCORE)
/score_sum = SUM(SCORE).
Output:
| PATIENT_ID | LASTNAME | last_visit | score_sum | score_avg |
| 1001       |  Hopper  | 2020-11-19 |      17.4 |       5.8 |
| 2002       |  Gosling | 2020-10-08 |       6.8 |       6.8 |
| 3003       |  Kemeny  | 2020-11-12 |         . |         . |
| 4004       |  Wirth   | 2020-11-05 |      15.4 |       7.7 |

SQL[edit]

-- drop tables
DROP TABLE IF EXISTS tmp_patients;
DROP TABLE IF EXISTS tmp_visits;
 
-- create tables
CREATE TABLE tmp_patients(
PATIENT_ID INT,
LASTNAME VARCHAR(20)
);
 
CREATE TABLE tmp_visits(
PATIENT_ID INT,
VISIT_DATE DATE,
SCORE NUMERIC(4,1)
);
 
-- load data from csv files
/*
-- Note: LOAD DATA LOCAL requires `local-infile` enabled on both the client and server else you get error "#1148 command is not allowed.."
LOAD DATA LOCAL INFILE '/home/csv/patients.csv' INTO TABLE `tmp_patients` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
LOAD DATA LOCAL INFILE '/home/csv/visits.csv' INTO TABLE `tmp_visits` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
*/

 
-- load data hard coded
INSERT INTO tmp_patients(PATIENT_ID, LASTNAME)
VALUES
(1001, 'Hopper'),
(4004, 'Wirth'),
(3003, 'Kemeny'),
(2002, 'Gosling'),
(5005, 'Kurtz');
 
INSERT INTO tmp_visits(PATIENT_ID, VISIT_DATE, SCORE)
VALUES
(2002, '2020-09-10', 6.8),
(1001, '2020-09-17', 5.5),
(4004, '2020-09-24', 8.4),
(2002, '2020-10-08', NULL),
(1001, NULL, 6.6),
(3003, '2020-11-12', NULL),
(4004, '2020-11-05', 7.0),
(1001, '2020-11-19', 5.3);
 
-- join tables and group
SELECT
p.PATIENT_ID,
p.LASTNAME,
MAX(VISIT_DATE) AS LAST_VISIT,
SUM(SCORE) AS SCORE_SUM,
CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG
FROM
tmp_patients p
LEFT JOIN tmp_visits v
ON v.PATIENT_ID = p.PATIENT_ID
GROUP BY
p.PATIENT_ID,
p.LASTNAME
ORDER BY
p.PATIENT_ID;
Output:
PATIENT_ID  LASTNAME  LAST_VISIT  SCORE_SUM  SCORE_AVG
1001        Hopper    2020-11-19       17.4       5.80
2002        Gosling   2020-10-08        6.8       6.80
3003        Kemeny    2020-11-12       NULL       NULL
4004        Wirth     2020-11-05       15.4       7.70
5005        Kurtz     NULL             NULL       NULL

Wren[edit]

Library: Wren-sort
Library: Wren-fmt
import "/fmt" for Fmt
import "/sort" for Sort
 
class Patient {
construct new(id, lastName) {
_id = id
_lastName = lastName
if (!__dir) __dir = {}
__dir[id] = lastName
if (!__ids) {
__ids = [id]
} else {
__ids.add(id)
Sort.insertion(__ids)
}
}
 
id { _id }
lastName { _lastName }
 
// maps an id to a lastname
static dir { __dir }
 
// maintains a sorted list of ids
static ids { __ids }
}
 
class Visit {
construct new(id, date, score) {
_id = id
_date = date || "0000-00-00"
_score = score
if (!__dir) __dir = {}
if (!__dir[id]) {
__dir[id] = [ [_date], [score] ]
} else {
__dir[id][0].add(_date)
__dir[id][1].add(score)
}
}
 
id { _id }
date { _date }
score { _score }
 
// maps an id to lists of dates and scores
static dir { __dir }
}
 
class Merge {
construct new(id) {
_id = id
}
 
id { _id }
lastName { Patient.dir[_id] }
dates { Visit.dir[_id][0] }
scores { Visit.dir[_id][1] }
lastVisit { Sort.merge(dates)[-1] }
 
scoreSum { scores.reduce(0) { |acc, s| s ? acc + s : acc } }
scoreAvg { scoreSum / scores.count { |s| s } }
 
static print(merges) {
System.print("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
var fmt = "| $d | $-7s | $s | $4s | $4s |"
for (m in merges) {
if (Visit.dir[m.id]) {
var lv = (m.lastVisit != "0000-00-00") ? m.lastVisit : " "
var ss = (m.scoreSum > 0) ? Fmt.f(4, m.scoreSum, 1) : " "
var sa = (!m.scoreAvg.isNan) ? Fmt.f(4, m.scoreAvg, 2) : " "
Fmt.print(fmt, m.id, m.lastName, lv, ss, sa)
} else {
Fmt.print(fmt, m.id, m.lastName, " ", " ", " ")
}
}
}
}
 
Patient.new(1001, "Hopper")
Patient.new(4004, "Wirth")
Patient.new(3003, "Kemeny")
Patient.new(2002, "Gosling")
Patient.new(5005, "Kurtz")
 
Visit.new(2002, "2020-09-10", 6.8)
Visit.new(1001, "2020-09-17", 5.5)
Visit.new(4004, "2020-09-24", 8.4)
Visit.new(2002, "2020-10-08", null)
Visit.new(1001, null , 6.6)
Visit.new(3003, "2020-11-12", null)
Visit.new(4004, "2020-11-05", 7.0)
Visit.new(1001, "2020-11-19", 5.3)
 
var merges = Patient.ids.map { |id| Merge.new(id) }.toList
Merge.print(merges)
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |