Merge and aggregate datasets

From Rosetta Code
Task
Merge and aggregate datasets
You are encouraged to solve this task according to the task description, using any language you may know.

Merge and aggregate datasets


Task

Merge and aggregate two datasets as provided in   .csv   files into a new resulting dataset.

Use the appropriate methods and data structures depending on the programming language.

Use the most common libraries only when built-in functionality is not sufficient.


Note

Either load the data from the   .csv   files or create the required data structures hard-coded.


patients.csv   file contents:

PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz


visits.csv   file contents:

PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3


Create a resulting dataset in-memory or output it to screen or file, whichever is appropriate for the programming language at hand.

Merge and group per patient id and last name,   get the maximum visit date,   and get the sum and average of the scores per patient to get the resulting dataset.


Note that the visit date is purposefully provided as ISO format,   so that it could also be processed as text and sorted alphabetically to determine the maximum date.

| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 6.80      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |


Note

This task is aimed in particular at programming languages that are used in data science and data processing, such as F#, Python, R, SPSS, MATLAB etc.


Related tasks



11l[edit]

Translation of: Python: Stdlib csv only
V patients_csv =
‘PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz’

V visits_csv =
‘PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3’

F csv2list(s)
   [[String]] rows
   L(row) s.split("\n")
      rows [+]= row.split(‘,’)
   R rows

V patients = csv2list(patients_csv)
V visits = csv2list(visits_csv)

V result = copy(patients)
result.sort_range(1..)
result[0].append(‘LAST_VISIT’)
V last = Dict(visits[1..], p_vis -> (p_vis[0], p_vis[1]))
L(record) 1 .< result.len
   result[record].append(last.get(result[record][0], ‘’))
result[0] [+]= [‘SCORE_SUM’, ‘SCORE_AVG’]

V n = Dict(patients[1..], p -> (p[0], 0))
V tot = Dict(patients[1..], p -> (p[0], 0.0))

L(record) visits[1..]
   V p = record[0]
   V score = record[2]
   I !score.empty
      n[p]++
      tot[p] += Float(score)

L(record) 1 .< result.len
   V p = result[record][0]
   I n[p] != 0
      result[record] [+]= [‘#3.1’.format(tot[p]), ‘#2.2’.format(tot[p] / n[p])]
   E
      result[record] [+]= [‘’, ‘’]

L(record) result
   print(‘| ’record.map(r -> r.center(10)).join(‘ | ’)‘ |’)
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.80    |
|    2002    |  Gosling   | 2020-10-08 |     6.8    |    6.80    |
|    3003    |   Kemeny   | 2020-11-12 |            |            |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.70    |
|    5005    |   Kurtz    |            |            |            |

AutoHotkey[edit]

Merge_and_aggregate(patients, visits){
    ; ID maps patient id -> last name; the other maps are keyed by id too
    ID := [], LAST_VISIT := [], SCORE_SUM := [], VISIT := []
    for i, line in StrSplit(patients, "`n", "`r"){
        if (i=1)                                ; skip header row
            continue
        x := StrSplit(line, ",")
        ID[x.1] := x.2
    }

    for i, line in StrSplit(visits, "`n", "`r"){
        if (i=1)                                ; skip header row
            continue
        x := StrSplit(line, ",")
        ; ISO dates compare correctly as plain strings
        LAST_VISIT[x.1] := x.2 > LAST_VISIT[x.1] ? x.2 : LAST_VISIT[x.1]
        SCORE_SUM[x.1] := (SCORE_SUM[x.1] ? SCORE_SUM[x.1] : 0) + (x.3 ? x.3 : 0)
        if x.3                                  ; count only visits with a score
            VISIT[x.1] := (VISIT[x.1] ? VISIT[x.1] : 0) + 1
    }

    output := "PATIENT_ID`tLASTNAME`tLAST_VISIT`tSCORE_SUM`tSCORE_AVG`n"
    for id, name in ID
        ; fix: use the loop variable "id", not the array "ID", for the first column
        output .= id "`t" name "`t" LAST_VISIT[id] "`t" SCORE_SUM[id] "`t" SCORE_SUM[id]/VISIT[id] "`n"
    return output
}
Examples:
patients =
(
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
)

visits =
(
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
)

MsgBox % Merge_and_aggregate(patients, visits)
return
Output:
PATIENT_ID	LASTNAME	LAST_VISIT	SCORE_SUM	SCORE_AVG
1001		Hopper		2020-11-19	17.400000	5.800000
2002		Gosling		2020-10-08	6.800000	6.800000
3003		Kemeny		2020-11-12	0
4004		Wirth		2020-11-05	15.400000	7.700000
5005		Kurtz

AWK[edit]

# syntax: GAWK -f MERGE_AND_AGGREGATE_DATASETS.AWK RC-PATIENTS.CSV RC-VISITS.CSV
# files may appear in any order
#
# sorting:
#   PROCINFO["sorted_in"] is used by GAWK
#   SORTTYPE is used by Thompson Automation's TAWK
#
{ # printf("%s %s\n",FILENAME,$0) # print input
    split($0,arr,",")
    if (FNR == 1) {
      # a header whose 2nd column is LASTNAME identifies the patients file
      file = (arr[2] == "LASTNAME") ? "patients" : "visits"
      next
    }
    # assign the key first: the original "patient_id_arr[key] = key = arr[1]"
    # depended on unspecified index-vs-rhs evaluation order across awk
    # implementations (POSIX leaves it undefined)
    key = arr[1]
    patient_id_arr[key] = key
    if (file == "patients") {
      lastname_arr[key] = arr[2]
    }
    else if (file == "visits") {
      # ISO dates compare chronologically as plain strings
      if (arr[2] > visit_date_arr[key]) {
        visit_date_arr[key] = arr[2]
      }
      if (arr[3] != "") {
        score_arr[key] += arr[3]
        score_count_arr[key]++
      }
    }
}
END {
    print("")
    # iterate patient ids in ascending string order
    # (PROCINFO["sorted_in"] is honored by GAWK, SORTTYPE by TAWK)
    PROCINFO["sorted_in"] = "@ind_str_asc" ; SORTTYPE = 1
    fmt = "%-10s %-10s %-10s %9s %9s %6s\n"
    printf(fmt,"patient_id","lastname","last_visit","score_sum","score_avg","scores")
    for (i in patient_id_arr) {
      # average only over visits that actually carried a score; blank otherwise
      avg = (score_count_arr[i] > 0) ? score_arr[i] / score_count_arr[i] : ""
      printf(fmt,patient_id_arr[i],lastname_arr[i],visit_date_arr[i],score_arr[i],avg,score_count_arr[i]+0)
    }
    exit(0)
}
Output:
patient_id lastname   last_visit score_sum score_avg scores
1001       Hopper     2020-11-19      17.4       5.8      3
2002       Gosling    2020-10-08       6.8       6.8      1
3003       Kemeny     2020-11-12                          0
4004       Wirth      2020-11-05      15.4       7.7      2
5005       Kurtz                                          0

C++[edit]

Uses C++20

#include <algorithm>
#include <iostream>
#include <optional>
#include <ranges>
#include <string>
#include <vector>

using namespace std;

// One row of patients.csv: ids are kept as strings since they are only
// joined and compared, never used numerically.
struct Patient
{
    string ID;
    string LastName;
};

// One row of visits.csv: Score is an empty optional when the CSV field
// was blank, and Date is "" when the visit date was missing.
struct Visit
{
    string PatientID;
    string Date;
    optional<float> Score;
};

// Build both datasets in memory, join them on patient id, and print the
// aggregated table (last visit date, score sum, score average).
int main(void) 
{
    auto patients = vector<Patient> {
        {"1001", "Hopper"},
        {"4004", "Wirth"},
        {"3003", "Kemeny"},
        {"2002", "Gosling"},
        {"5005", "Kurtz"}};

    // a missing score is a default-constructed (empty) optional,
    // a missing date is the empty string
    auto visits = vector<Visit> {    
        {"2002", "2020-09-10", 6.8},
        {"1001", "2020-09-17", 5.5},
        {"4004", "2020-09-24", 8.4},
        {"2002", "2020-10-08", },
        {"1001", ""          , 6.6},
        {"3003", "2020-11-12", },
        {"4004", "2020-11-05", 7.0},
        {"1001", "2020-11-19", 5.3}};

    // sort the patients by ID
    sort(patients.begin(), patients.end(), 
         [](const auto& a, const auto&b){ return a.ID < b.ID;});    

    cout << "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n";
    for(const auto& patient : patients)
    {
        // loop over all of the patients and determine the fields
        string lastVisit;
        float sum = 0;
        int numScores = 0;
        
        // use C++20 ranges to filter the visits by patients
        auto patientFilter = [&patient](const Visit &v){return v.PatientID == patient.ID;};
        for(const auto& visit : visits | views::filter( patientFilter ))
        {
            if(visit.Score)
            {
                sum += *visit.Score;
                numScores++;
            }
            // ISO dates compare chronologically as strings;
            // "" (missing) never beats a real date
            lastVisit = max(lastVisit, visit.Date);
        }
        
        // format the output; default stream precision prints e.g. 17.4 / 5.8
        cout << "|       " << patient.ID << " | ";
        cout.width(8); cout << patient.LastName << " | ";
        cout.width(10); cout << lastVisit << " | ";
        if(numScores > 0)
        {
            cout.width(9); cout << sum << " | ";
            cout.width(9); cout << (sum / float(numScores));
        }
        else cout << "          |          ";
        cout << " |\n";
    }
}
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
|       1001 |   Hopper | 2020-11-19 |      17.4 |       5.8 |
|       2002 |  Gosling | 2020-10-08 |       6.8 |       6.8 |
|       3003 |   Kemeny | 2020-11-12 |           |           |
|       4004 |    Wirth | 2020-11-05 |      15.4 |       7.7 |
|       5005 |    Kurtz |            |           |           |

C#[edit]

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Runtime.Serialization;

/// <summary>
/// Merges the patients and visits CSV datasets and prints, per patient,
/// the last visit date plus the sum and average of the recorded scores.
/// </summary>
public static class MergeAndAggregateDatasets
{
    public static void Main()
    {
        string patientsCsv = @"
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz";

        string visitsCsv = @"
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3";

        string format = "yyyy-MM-dd";
        var formatProvider = new DateTimeFormat(format).FormatProvider;

        var patients = ParseCsv(
            patientsCsv.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries),
            line => (PatientId: int.Parse(line[0]), LastName: line[1]));

        // Blank dates/scores parse to null so the aggregates can skip them.
        // NOTE(review): double.TryParse uses the current culture; on a
        // comma-decimal locale "6.8" would fail to parse — confirm whether
        // CultureInfo.InvariantCulture should be passed explicitly.
        var visits = ParseCsv(
            visitsCsv.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries),
            line => (
                PatientId: int.Parse(line[0]),
                VisitDate: DateTime.TryParse(line[1], formatProvider, DateTimeStyles.None, out var date) ? date : default(DateTime?),
                Score: double.TryParse(line[2], out double score) ? score : default(double?)
            )
        );

        var results =
            patients.GroupJoin(visits,
                p => p.PatientId,
                v => v.PatientId,
                (p, vs) => (
                    p.PatientId,
                    p.LastName,
                    LastVisit: vs.Max(v => v.VisitDate),
                    // Sum() over an empty or all-null nullable sequence returns 0,
                    // which printed "0" where the task expects a blank cell;
                    // yield null instead when the patient has no score at all.
                    ScoreSum: vs.Any(v => v.Score.HasValue) ? vs.Sum(v => v.Score) : default(double?),
                    ScoreAvg: vs.Average(v => v.Score)
                )
            ).OrderBy(r => r.PatientId);

        Console.WriteLine("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |");
        foreach (var r in results) {
            Console.WriteLine($"| {r.PatientId,-10} | {r.LastName,-8} | {r.LastVisit?.ToString(format) ?? "",-10} | {r.ScoreSum,9} | {r.ScoreAvg,9} |");
        }
    }

    /// <summary>
    /// Skips the header row and maps each remaining comma-split line
    /// through <paramref name="constructor"/>.
    /// </summary>
    private static IEnumerable<T> ParseCsv<T>(string[] contents, Func<string[], T> constructor)
    {
        for (int i = 1; i < contents.Length; i++) {
            var line = contents[i].Split(',');
            yield return constructor(line);
        }
    }

}
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 |      17.4 |       5.8 |
| 2002       | Gosling  | 2020-10-08 |       6.8 |       6.8 |
| 3003       | Kemeny   | 2020-11-12 |         0 |           |
| 4004       | Wirth    | 2020-11-05 |      15.4 |       7.7 |
| 5005       | Kurtz    |            |         0 |           |

Datalog[edit]

The following implementation is for Souffle. Souffle does not currently include dates and times in the base distribution, but it is straightforward to use C timestamps:

// datetime.cpp
#include <ctime>
#include <cstdint>
extern "C" {
  // Convert an ISO "YYYY-MM-DD" string to a Unix timestamp.
  // Fix: the identifier must be "from_date" (one token) to be legal C++
  // and to match the Souffle declaration ".functor from_date(...)".
  int64_t from_date(const char* string) {
    struct tm tmInfo = {0};
    strptime(string, "%Y-%m-%d", &tmInfo);
    return mktime(&tmInfo); // localtime
  }
}

Rather than combine the summaries, the date and score summaries have been presented separately.

/* rosetta.dl */
// NaN marks a missing score in the Visit facts.
#define NaN to_float("NaN")
.functor from_date(date:symbol) : number
.decl Patient(id:number, lastname:symbol)
.decl Visit(id:number, date:symbol, score:float)
.decl SummaryDates(id:number, lastname:symbol, last_date:symbol)
.decl SummaryScores(id:number, lastname:symbol, score_sum:float, score_mean:float)
// Timestamp the functor yields for an empty date string; used to exclude
// missing dates from the max aggregate below.
.decl MissingDates(x:number)
 
Patient(1001,"Hopper").
Patient(4004,"Wirth").
Patient(3003,"Kemeny").
Patient(2002,"Gosling").
Patient(5005,"Kurtz").

Visit(2002,"2020-09-10",6.8).
Visit(1001,"2020-09-17",5.5).
Visit(4004,"2020-09-24",8.4).
Visit(2002,"2020-10-08",NaN).
Visit(1001,"",6.6).
Visit(3003,"2020-11-12",NaN).
Visit(4004,"2020-11-05",7.0).
Visit(1001,"2020-11-19",5.3).

// A date is "missing" if it converts to the same timestamp as "".
MissingDates(@from_date("")) :- true.

// Latest visit per patient: max timestamp over dated visits, then recover
// the date string that produced that timestamp.
SummaryDates(id, lastname, last_date) :- 
     Patient(id,lastname),
     last_timestamp = max ts: {Visit(id, date, _), ts = @from_date(date), !MissingDates(ts)},
     Visit(id, last_date, _), last_timestamp = @from_date(last_date). 
// Sum and mean of the recorded scores.
// NOTE(review): "score != NaN" is relied on to drop missing scores, but
// under IEEE-754 NaN != NaN is always true — confirm Souffle's comparison
// semantics actually filter NaN here (the shown output suggests it does).
SummaryScores(id, lastname, score_sum, score_mean) :- 
     Patient(id,lastname),
     score_sum = sum score: {Visit(id, _, score), score != NaN},
     score_mean = mean score: {Visit(id, _, score), score != NaN}.
.output SummaryDates
.output SummaryScores

Then this is called using:

g++ -shared -fPIC datetime.cpp -o libfunctors.so
souffle -D- rosetta.dl
Output:
---------------
SummaryScores
id	lastname	score_sum	score_mean
===============
1001	Hopper	17.399999999999999	5.7999999999999998
2002	Gosling	6.7999999999999998	6.7999999999999998
4004	Wirth	15.4	7.7000000000000002
===============
---------------
SummaryDates
id	lastname	last_date
===============
1001	Hopper	2020-11-19
2002	Gosling	2020-10-08
3003	Kemeny	2020-11-12
4004	Wirth	2020-11-05
===============

F#[edit]

Note that the scores are left justified to copy the task description. It would be more natural to right justify them.

// Merge and aggregate datasets. Nigel Galloway: January 6th., 2021
let rFile(fName)=seq{use n=System.IO.File.OpenText(fName)
                     n.ReadLine() |> ignore
                     while not n.EndOfStream do yield n.ReadLine().Split [|','|]}
let N=rFile("file1.txt") |> Seq.sort
let G=rFile("file2.txt") |> Seq.groupBy(fun n->n.[0]) |> Map.ofSeq
let fN n i g e l=printfn "| %-10s | %-8s | %10s |  %-9s | %-9s |" n i g e l 
// Collect only the parseable scores so the average divides by the number
// of actual scores rather than by the number of visits (the original
// printed Gosling's average as 3.4 instead of the task's 6.80), and so a
// patient whose visits all lack scores gets blanks rather than 0-checks.
let fG n g=let vs=G.[n]|>Seq.choose(fun r->try Some(float r.[2]) with :? System.FormatException->None)|>List.ofSeq
           let z=List.sum vs
           fN n g (G.[n]|>Seq.sort|>Seq.last).[1] (if vs.IsEmpty then "" else string z) (if vs.IsEmpty then "" else string(z/float vs.Length))
Output:
printfn "| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |"
N|>Seq.iter(fun n->match G.ContainsKey n.[0] with true->fG n.[0] n.[1] |_->fN n.[0] n.[1] "" "" "")

| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 |  17.4      | 5.8       |
| 2002       | Gosling  | 2020-10-08 |  6.8       | 3.4       |
| 3003       | Kemeny   | 2020-11-12 |            |           |
| 4004       | Wirth    | 2020-11-05 |  15.4      | 7.7       |
| 5005       | Kurtz    |            |            |           |

Go[edit]

Translation of: Wren
package main

import (
    "fmt"
    "math"
    "sort"
)

// Patient couples a numeric id with a last name.
type Patient struct {
    id       int
    lastName string
}

// maps an id to a lastname
var patientDir = make(map[int]string)

// maintains a sorted list of ids
var patientIds []int

// patientNew registers a patient in the global directory and keeps the
// id list sorted (re-sorting on every insert — fine for this tiny data).
func patientNew(id int, lastName string) Patient {
    patientDir[id] = lastName
    patientIds = append(patientIds, id)
    sort.Ints(patientIds)
    return Patient{id, lastName}
}

// DS holds a patient's visit history as parallel slices: index i of
// dates and scores belongs to the same visit.
type DS struct {
    dates  []string
    scores []float64
}

// Visit is one row of the visits dataset.
type Visit struct {
    id    int
    date  string
    score float64
}

// maps an id to lists of dates and scores
var visitDir = make(map[int]DS)

// visitNew records one visit row in the per-patient directory and
// returns it; a missing date is stored as the "0000-00-00" sentinel.
func visitNew(id int, date string, score float64) Visit {
    if date == "" {
        date = "0000-00-00"
    }
    if ds, found := visitDir[id]; found {
        ds.dates = append(ds.dates, date)
        ds.scores = append(ds.scores, score)
        visitDir[id] = ds
    } else {
        visitDir[id] = DS{dates: []string{date}, scores: []float64{score}}
    }
    return Visit{id: id, date: date, score: score}
}

// Merge is a view that joins the patient and visit directories by id.
type Merge struct{ id int }

func (m Merge) lastName() string  { return patientDir[m.id] }
func (m Merge) dates() []string   { return visitDir[m.id].dates }
func (m Merge) scores() []float64 { return visitDir[m.id].scores }

// lastVisit returns the latest visit date. Dates are ISO formatted (or
// the "0000-00-00" sentinel), so lexicographic order is chronological:
// sort a private copy and take the final element.
func (m Merge) lastVisit() string {
    ds := append([]string(nil), m.dates()...)
    sort.Strings(ds)
    return ds[len(ds)-1]
}

// scoreSum adds up the recorded scores, skipping the -1 "no score" marker.
func (m Merge) scoreSum() float64 {
    var total float64
    for _, s := range m.scores() {
        if s == -1 {
            continue
        }
        total += s
    }
    return total
}

// scoreAvg divides the score sum by the number of real (non -1) scores.
// With no real scores the sum is also 0, so the result is NaN (0/0),
// which the caller detects via math.IsNaN.
func (m Merge) scoreAvg() float64 {
    valid := 0
    for _, s := range m.scores() {
        if s != -1 {
            valid++
        }
    }
    return m.scoreSum() / float64(valid)
}

// mergePrint renders the merged table; blank cells replace the
// "0000-00-00" date sentinel, a zero score sum, and a NaN average.
func mergePrint(merges []Merge) {
    fmt.Println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
    f := "| %d       | %-7s  | %s | %4s      | %4s      |\n"
    for _, m := range merges {
        _, ok := visitDir[m.id]
        if ok {
            lv := m.lastVisit()
            if lv == "0000-00-00" {
                lv = "          "
            }
            scoreSum := m.scoreSum()
            ss := fmt.Sprintf("%4.1f", scoreSum)
            if scoreSum == 0 {
                ss = "    "
            }
            scoreAvg := m.scoreAvg()
            sa := "    "
            if !math.IsNaN(scoreAvg) {
                sa = fmt.Sprintf("%4.2f", scoreAvg)
            }
            fmt.Printf(f, m.id, m.lastName(), lv, ss, sa)
        } else {
            // patient with no visits at all: every visit column is blank
            fmt.Printf(f, m.id, m.lastName(), "          ", "    ", "    ")
        }
    }
}

// main builds the two hard-coded datasets (mirroring the task's CSV
// files), then merges and prints them in ascending patient-id order.
func main() {
    patientNew(1001, "Hopper")
    patientNew(4004, "Wirth")
    patientNew(3003, "Kemeny")
    patientNew(2002, "Gosling")
    patientNew(5005, "Kurtz")

    visitNew(2002, "2020-09-10", 6.8)
    visitNew(1001, "2020-09-17", 5.5)
    visitNew(4004, "2020-09-24", 8.4)
    visitNew(2002, "2020-10-08", -1) // -1 signifies no score
    visitNew(1001, "", 6.6)          // "" signifies no date
    visitNew(3003, "2020-11-12", -1)
    visitNew(4004, "2020-11-05", 7.0)
    visitNew(1001, "2020-11-19", 5.3)

    merges := make([]Merge, len(patientIds))
    for i, id := range patientIds {
        merges[i] = Merge{id}
    }
    mergePrint(merges)
}
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |

Haskell[edit]

Reading and merging[edit]

Merging of fields and databases is defined as a monoid operation for corresponding types.

import Data.List
import Data.Maybe
import System.IO (readFile)
import Text.Read (readMaybe)
import Control.Applicative ((<|>))

------------------------------------------------------------

-- A database is a list of patients; merging two databases concatenates
-- them and then collapses rows that share a patient id.
newtype DB = DB { entries :: [Patient] }
  deriving Show

instance Semigroup DB where
  DB a <> DB b = normalize $ a <> b

instance Monoid DB where
  mempty = DB []

-- Sort by id, group equal ids, and fold each group with the Patient
-- Semigroup so duplicate rows merge into one.
normalize :: [Patient] -> DB
normalize = DB
            . map mconcat 
            . groupBy (\x y -> pid x == pid y)
            . sortOn pid
 
------------------------------------------------------------

data Patient = Patient { pid :: String
                       , name :: Maybe String
                       , visits :: [String]
                       , scores :: [Float] }
  deriving Show

-- Merging keeps the left id, prefers the first available name (Maybe's
-- (<|>) keeps the first Just), and concatenates visit dates and scores
-- (list (<|>) is (++)). The original wrote the id field as
-- "fromJust $ Just p1 <|> Just p2", which is always just p1.
instance Semigroup Patient where
  Patient p1 n1 v1 s1 <> Patient _ n2 v2 s2 =
    Patient p1
            (n1 <|> n2)
            (v1 <|> v2)
            (s1 <|> s2)

instance Monoid Patient where
  mempty = Patient mempty mempty mempty mempty
    
------------------------------------------------------------

-- Parse a CSV document into a normalized database.
readDB :: String  -> DB
readDB = normalize
         . mapMaybe readPatient
         . readCSV

-- Build a Patient from one CSV row (a header->value association list).
-- Only PATIENT_ID is mandatory; an empty VISIT_DATE or unreadable SCORE
-- contributes an empty list to the respective field.
readPatient r = do
  i <- lookup "PATIENT_ID" r
  let n = lookup "LASTNAME" r
  let d = lookup "VISIT_DATE" r >>= readDate
  let s = lookup "SCORE" r >>= readMaybe
  return $ Patient i n (maybeToList d) (maybeToList s)
  where
    readDate [] = Nothing
    readDate d = Just d

-- Split CSV text into rows of (header, value) pairs.
-- Fix: the declared type was [(String, String)], but the expression
-- builds one association list PER ROW, i.e. [[(String, String)]] —
-- the original signature would not type-check.
readCSV :: String -> [[(String, String)]]
readCSV txt = zip header <$> body
  where
    header:body = splitBy ',' <$> lines txt
    splitBy ch = unfoldr go
      where
        go [] = Nothing
        go s  = Just $ drop 1 <$> span (/= ch) s
*Main> let patients = readDB <$> readFile "patients.csv"
*Main> let visits = readDB <$> readFile "visits.csv" 

*Main> mapM_ print . entries =<< patients
Patient {pid = "1001", name = Just "Hopper", visits = [], scores = []}
Patient {pid = "2002", name = Just "Gosling", visits = [], scores = []}
Patient {pid = "3003", name = Just "Kemeny", visits = [], scores = []}
Patient {pid = "4004", name = Just "Wirth", visits = [], scores = []}
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}

*Main> mapM_ print . entries =<< visits
Patient {pid = "1001", name = Nothing, visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
Patient {pid = "2002", name = Nothing, visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
Patient {pid = "3003", name = Nothing, visits = ["2020-11-12"], scores = []}
Patient {pid = "4004", name = Nothing, visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}

*Main> mapM_ print . entries =<< patients <> visits
Patient {pid = "1001", name = Just "Hopper", visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
Patient {pid = "2002", name = Just "Gosling", visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
Patient {pid = "3003", name = Just "Kemeny", visits = ["2020-11-12"], scores = []}
Patient {pid = "4004", name = Just "Wirth", visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}

Pretty tabulation[edit]

-- Render the database as an aligned, pipe-separated table. `cols` is a
-- list of field extractors (one per column); every column is padded to
-- the width of its widest entry.
tabulateDB (DB ps) header cols = intercalate "|" <$> body
  where
    body = transpose $ zipWith pad width table
    table = transpose $ header : map showPatient ps
    showPatient p = sequence cols p
    width = maximum . map length <$> table
    pad n col = (' ' :) . take (n+1) . (++ repeat ' ') <$> col

main = do
  a <- readDB <$> readFile "patients.csv"
  b <- readDB <$> readFile "visits.csv"
  -- (<>) on DB joins the two datasets by patient id
  mapM_ putStrLn $ tabulateDB (a <> b) header fields
  where
    header = [ "PATIENT_ID", "LASTNAME", "VISIT_DATE"
             , "SCORES SUM","SCORES AVG"]
    -- NOTE(review): the date column takes the *last* visit in list
    -- order, which equals the maximum only because the sample data
    -- happens to be date-ordered per patient — confirm for other inputs.
    fields = [ pid
             , fromMaybe [] . name
             , \p -> case visits p of {[] -> []; l -> last l}
             , \p -> case scores p of {[] -> []; s -> show (sum s)}
             , \p -> case scores p of {[] -> []; s -> show (mean s)} ]

    -- arithmetic mean; genericLength gives a Float-compatible length
    mean lst = sum lst / genericLength lst
*Main> main
 PATIENT_ID | LASTNAME | VISIT_DATE | SCORES SUM | SCORES AVG 
 1001       | Hopper   | 2020-11-19 | 17.4       | 5.7999997  
 2002       | Gosling  | 2020-10-08 | 6.8        | 6.8        
 3003       | Kemeny   | 2020-11-12 |            |            
 4004       | Wirth    | 2020-11-05 | 15.4       | 7.7        
 5005       | Kurtz    |            |            |            

J[edit]

One approach here would be to use Jd

In other words, we can set things up like this:

NB. setup:
require'jd pacman'
load JDP,'tools/csv_load.ijs'
F=: jpath '~temp/rosettacode/example/CSV'
jdcreatefolder_jd_ CSVFOLDER=: F

assert 0<{{)n
PATIENTID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
}} fwrite F,'patients.csv'

assert 0<{{)n
PATIENTID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
}} fwrite F,'visits.csv'

csvprepare 'patients';F,'patients.csv'
csvprepare 'visits';F,'visits.csv'

csvload 'patients';1
csvload 'visits';1

jd'ref patients PATIENTID  visits PATIENTID'

And, then we can run our query:

require'jd'

echo jd {{)n
  reads
        PATIENT_ID: first p.PATIENTID,
        LASTNAME:   first p.LASTNAME,
        LAST_VISIT: max v.VISIT_DATE,
        SCORE_SUM:  sum v.SCORE,
        SCORE_AVG:  avg v.SCORE
    by
      p.PATIENTID
    from
      p:patients,
      v:p.visits
}} -.LF

Which displays this result:

┌───────────┬──────────┬────────┬──────────┬─────────┬─────────┐
│p.PATIENTID│PATIENT_ID│LASTNAME│LAST_VISIT│SCORE_SUM│SCORE_AVG│
├───────────┼──────────┼────────┼──────────┼─────────┼─────────┤
│1001       │1001      │Hopper  │2020-09-17│5.5      │5.5      │
│4004       │4004      │Wirth   │2020-09-24│8.4      │8.4      │
│3003       │3003      │Kemeny  │2020-11-12│ __      │ __      │
│2002       │2002      │Gosling │2020-09-10│6.8      │6.8      │
│5005       │5005      │Kurtz   │?         │  0      │  0      │
└───────────┴──────────┴────────┴──────────┴─────────┴─────────┘

Another approach would be to use J's csv library:

require'csv'
patients=: fixcsv {{)n
PATIENTID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
}}

visits=: fixcsv {{)n
PATIENTID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
}}

NB. Join the patients and visits tables on patient id and aggregate.
task=: {{
  P=. <@:>"1|:/:~}.patients
  V=. <@:>"1|:/:~}.visits
  id=. 0 {:: P
  nm=. 1 {:: P
  sel1=. (0 {:: P) e. 0 {:: V
  sel2=. (~.0 {:: V) e. 0 {:: P NB. unnecessary for this example
  exp=. sel1 #inv sel2 # ]
  agg=. /.(&.:".)
  vdt=. exp     (0 {:: V) {:/.         1 {:: V
  NB. NOTE(review): 0". converts an empty SCORE field to 0, so the
  NB. average below divides by the total visit count — Gosling shows
  NB. 3.4 where the task expects 6.80; confirm whether this is intended.
  sum=. exp ":,.(0 {:: V) +//.     0". 2 {:: V
  avg=. exp ":,.(0 {:: V) (+/%#)/. 0". 2 {:: V
  labels=. ;:'PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG'
  labels,:id;nm;vdt;sum;avg
}}

Here:

   task''
┌──────────┬────────┬──────────┬─────────┬─────────┐
│PATIENT_ID│LASTNAME│LAST_VISIT│SCORE_SUM│SCORE_AVG│
├──────────┼────────┼──────────┼─────────┼─────────┤
│1001      │Hopper  │2020-11-19│17.4     │5.8      │
│2002      │Gosling │2020-10-08│ 6.8     │3.4      │
│3003      │Kemeny  │2020-11-12│   0     │  0      │
│4004      │Wirth   │2020-11-05│15.4     │7.7      │
│5005      │Kurtz   │          │         │         │
└──────────┴────────┴──────────┴─────────┴─────────┘

If the empty score in visits was a display concern, we might instead do it this way:

NB. Variant that blanks SCORE_SUM/SCORE_AVG for patients whose visits
NB. carry no scores at all, instead of displaying 0.
task=: {{
  P=. <@:>"1|:/:~}.patients
  V=. <@:>"1|:/:~}.visits
  id=. 0 {:: P
  nm=. 1 {:: P
  sel1=. (0 {:: P) e. 0 {:: V
  sel2=. (~.0 {:: V) e. 0 {:: P NB. unnecessary for this example
  exp=. sel1 #inv sel2 # ]
  agg=. /.(&.:".)
  vdt=. exp     (0 {:: V) {:/.          1 {:: V
  NB. sel3 marks id groups having at least one non-empty score field
  sel3=. (0 {:: V) +.//. 2 *@#@{::"1 }.visits
  exp2=: [:exp sel3 #inv sel3 #]
  NB. NOTE(review): for the kept groups the average still counts empty
  NB. scores as 0 (Gosling shows 3.4 vs the task's 6.80) — confirm intent.
  sum=. exp2 ":,.(0 {:: V) +//.     0". 2 {:: V
  avg=. exp2 ":,.(0 {:: V) (+/%#)/. 0". 2 {:: V
  labels=. ;:'PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG'
  labels,:id;nm;vdt;sum;avg
}}

Which gives us:

   task''
┌──────────┬────────┬──────────┬─────────┬─────────┐
│PATIENT_ID│LASTNAME│LAST_VISIT│SCORE_SUM│SCORE_AVG│
├──────────┼────────┼──────────┼─────────┼─────────┤
│1001      │Hopper  │2020-11-19│17.4     │5.8      │
│2002      │Gosling │2020-10-08│ 6.8     │3.4      │
│3003      │Kemeny  │2020-11-12│         │         │
│4004      │Wirth   │2020-11-05│15.4     │7.7      │
│5005      │Kurtz   │          │         │         │
└──────────┴────────┴──────────┴─────────┴─────────┘

jq[edit]

Works with: jq

Works with gojq, the Go implementation of jq

In the context of jq, a relational dataset "table" is naturally represented as an array of JSON objects, each representing a row in the table. When displaying such a table, we will only show the constituent rows. Null values will be represented by JSON's `null`.

Ingesting CSV data

# objectify/1 takes an array of atomic values as inputs, and packages
# these into an object with keys specified by the "headers" array and
# values obtained by trimming string values, replacing empty strings
# by null, and converting strings to numbers if possible.
def objectify(headers):
  def tonumberq: tonumber? // .;
  def trimq: if type == "string" then sub("^ +";"") | sub(" +$";"") else . end;
  def tonullq: if . == "" then null else . end;
  . as $in
  | reduce range(0; headers|length) as $i
      ({}; .[headers[$i]] = ($in[$i] | trimq | tonumberq | tonullq) );

# csv2jsonHelper: input is an array of rows (arrays of strings); the
# first row supplies the headers and each remaining non-empty row
# becomes one object.
def csv2jsonHelper:
  .[0] as $headers
  | reduce (.[1:][] | select(length > 0) ) as $row
      ([]; . + [ $row|objectify($headers) ]);

Aggregation functions

# output {LAST_VISIT}
def LAST_VISIT($patient_id):
  {LAST_VISIT: (map(select( .PATIENT_ID == $patient_id).VISIT_DATE) | max)};

# output {SCORE_SUM, SCORE_AVG}
# Fix: drop null scores before counting, so the average divides by the
# number of actual scores rather than by the number of visits (the
# original reported Gosling as 3.4 instead of 6.8). Testing SCORE_SUM
# against null (not truthiness) also keeps a legitimate 0 sum.
def SCORE_SUMMARY($patient_id):
  map(select( .PATIENT_ID == $patient_id).SCORE | select(. != null))
  | {SCORE_SUM: add, count: length}
  | {SCORE_SUM, SCORE_AVG: (if .SCORE_SUM != null and .count > 0 then .SCORE_SUM/.count else null end)};
The task
# Read the two tables:
INDEX($patients | [splits("\n")] | map(split(",")) | csv2jsonHelper[]; .PATIENT_ID) as $patients
| ($visits | [splits("\n")] | map(split(",")) | csv2jsonHelper) as $visits
# Construct the new table: one row per patient (the original iterated
# $visits, which duplicated rows, dropped visitless patient 5005, and
# put the whole patient object in LASTNAME).
| [$patients[]] | sort_by(.PATIENT_ID)
| map(.PATIENT_ID as $PATIENT_ID
      | {PATIENT_ID, LASTNAME}
        + ($visits | LAST_VISIT($PATIENT_ID) + SCORE_SUMMARY($PATIENT_ID)))
# ... but display it as a sequence of JSON objects
| .[]

Invocation

   jq -Rnc --rawfile patients patients.csv --rawfile visits visits.csv -f program.jq
Output:
{"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4}
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8}
{"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7}
{"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4}
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8}
{"PATIENT_ID":3003,"LASTNAME":{"PATIENT_ID":3003,"LASTNAME":"Kemeny"},"LAST_VISIT":"2020-11-12","SCORE_SUM":null,"SCORE_AVG":null}
{"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7}
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8}

Julia[edit]

Translation of: Python
using CSV, DataFrames, Statistics

# load data from csv files
#df_patients = CSV.read("patients.csv", DataFrame)
#df_visits = CSV.read("visits.csv", DataFrame)

# create DataFrames from text that is hard coded, so use IOBuffer(String) as input
str_patients = IOBuffer("""PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
""")
df_patients = CSV.read(str_patients, DataFrame)
str_visits = IOBuffer("""PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
""")
df_visits = CSV.read(str_visits, DataFrame)

# merge on PATIENT_ID, using an :outer join or we lose Kurtz, who has no data, sort by ID
# NOTE(review): `join(...; kind=:outer)` and `by` below are pre-1.0 DataFrames
# API; current DataFrames uses `outerjoin` and `combine(groupby(...))` —
# confirm the package version this targets.
df_merge = sort(join(df_patients, df_visits, on="PATIENT_ID", kind=:outer), (:PATIENT_ID,))

# Apply f to the non-missing values of column a; returns [] for an empty
# column, and a[1] (missing) when every value is missing.
fnonmissing(a, f) = isempty(a) ? [] : isempty(skipmissing(a)) ? a[1] : f(skipmissing(a))

# group by patient id / last name and then aggregate to get latest visit and mean score
df_result = by(df_merge, [:PATIENT_ID, :LASTNAME]) do df
    DataFrame(LATEST_VISIT = fnonmissing(df[:VISIT_DATE], maximum),
              SUM_SCORE = fnonmissing(df[:SCORE], sum),
              MEAN_SCORE = fnonmissing(df[:SCORE], mean))
end
println(df_result)
Output:
5×5 DataFrame
│ Row │ PATIENT_ID │ LASTNAME │ LATEST_VISIT │ SUM_SCORE │ MEAN_SCORE │
│     │ Int64?     │ String?  │ Dates.Date?  │ Float64?  │ Float64?   │
├─────┼────────────┼──────────┼──────────────┼───────────┼────────────┤
│ 1   │ 1001       │ Hopper   │ 2020-11-19   │ 17.4      │ 5.8        │
│ 2   │ 2002       │ Gosling  │ 2020-10-08   │ 6.8       │ 6.8        │
│ 3   │ 3003       │ Kemeny   │ 2020-11-12   │ missing   │ missing    │
│ 4   │ 4004       │ Wirth    │ 2020-11-05   │ 15.4      │ 7.7        │
│ 5   │ 5005       │ Kurtz    │ missing      │ missing   │ missing    │

Mathematica/Wolfram Language[edit]

(* Parse the two hard-coded CSV strings into lists of rows; row 1 is the header. *)
a = ImportString["PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz", "CSV"];
b = ImportString["PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3", "CSV"];
(* Turn each data row into an association keyed by the header names; empty
   VISIT_DATE/SCORE fields become Missing[]. *)
a = <|a[[1, 1]] -> #1, a[[1, 2]] -> #2|> & @@@ Rest[a];
b = <|b[[1, 1]] -> #1, b[[1, 2]] -> If[#2 != "", DateObject[#2], Missing[]], b[[1, 3]] -> If[#3 =!= "", #3, Missing[]]|> & @@@ Rest[b];
(* Outer join on PATIENT_ID so patients without visits are kept, then group per patient. *)
j = JoinAcross[a, b, Key["PATIENT_ID"], "Outer"];
gr = GroupBy[j, #["PATIENT_ID"] &];
(* Aggregate each group: latest visit date plus sum and mean of the
   non-missing scores; groups with nothing to aggregate stay Missing[]. *)
<|"PATIENT_ID" -> #[[1, "PATIENT_ID"]], 
    "LASTNAME" -> #[[1, "LASTNAME"]], 
    "VISIT_DATE" -> If[DeleteMissing[#[[All, "VISIT_DATE"]]] =!= {}, Max@DeleteMissing[#[[All, "VISIT_DATE"]]], Missing[]], 
    "SCORE_SUM" -> If[DeleteMissing@#[[All, "SCORE"]] =!= {}, Total@DeleteMissing@#[[All, "SCORE"]], Missing[]], 
    "SCORE_AVG" -> If[DeleteMissing@#[[All, "SCORE"]] =!= {}, Mean@DeleteMissing@#[[All, "SCORE"]], Missing[]]|> & /@ 
  gr // Dataset
Output:
	PATIENT_ID	LASTNAME	VISIT_DATE	SCORE_SUM	SCORE_AVG
2002	2002	Gosling	Thu 8 Oct 2020	6.8	6.8
1001	1001	Hopper	Thu 19 Nov 2020	17.4	5.8
4004	4004	Wirth	Thu 5 Nov 2020	15.4	7.7
3003	3003	Kemeny	Thu 12 Nov 2020	\[LongDash]	\[LongDash]
5005	5005	Kurtz	\[LongDash]	\[LongDash]	\[LongDash]

Mercury[edit]

:- module rosetta.

:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.

:- implementation.
:- import_module list, int, float, string, maybe, solutions.

% Patient facts: patient(Id, Lastname).
:- pred patient(int::out, string::out) is multi.
patient(1001, "Hopper").
patient(4004, "Wirth").
patient(3003, "Kemeny").
patient(2002, "Gosling").
patient(5005, "Kurtz").

% nan stands in for a missing score; it is filtered out with is_nan/1 below.
:- func nan = float.
nan = det_to_float("NaN").

% A visit date, or `no` when the CSV field was empty.
:- type maybe_date ---> date(year::int, month::int, day::int); no.

% Visit facts: visit(Id, MaybeDate, Score).
:- pred visit(int::out, maybe_date::out, float::out) is multi.
visit(2002, date(2020,09,10), 6.8).
visit(1001, date(2020,09,17), 5.5).
visit(4004, date(2020,09,24), 8.4).
visit(2002, date(2020,10,08), nan).
visit(1001, no, 6.6).
visit(3003, date(2020,11,12), nan).
visit(4004, date(2020,11,05), 7.0).
visit(1001, date(2020,11,19), 5.3).

%% Utilities
% Fold over all solutions of a unary closure ("bag"): count, sum, average
% (nan for an empty bag) and maximum (starting from a supplied Initial).
:- pred bag_count(pred(T)::(pred(out) is nondet), int::out) is det.
:- pred bag_sum(pred(float)::(pred(out) is nondet), float::out) is det.
:- pred bag_avg(pred(float)::(pred(out) is nondet), float::out) is det.
:- pred bag_max(pred(T)::(pred(out) is nondet), T::in, T::out) is det.
bag_count(Predicate, Count) :-
    promise_equivalent_solutions[Count] (
	unsorted_aggregate(Predicate,
			   (pred(_X::in,Y::in,Z::out) is det :- Z is Y+1),
			   0, Count)).
bag_sum(Predicate, Sum) :-
    promise_equivalent_solutions[Sum] (
	unsorted_aggregate(Predicate,
			   (pred(X::in,Y::in,Z::out) is det :- Z is X+Y),
			   0.0, Sum)).
bag_avg(Predicate, Avg) :-
    bag_count(Predicate, N),
    bag_sum(Predicate, Sum),
    (if N = 0 then Avg = nan else Avg is Sum/float(N)).
bag_max(Predicate, Initial, Max) :-
    promise_equivalent_solutions [Max] (
	unsorted_aggregate(Predicate,
			   (pred(X::in,Y::in,Z::out) is det :-
				compare(R,X,Y),
				(if R = (>) then Z = X else Z = Y)),
			   Initial, Max)).

% One summary tuple per patient: nan scores and `no` dates are filtered out
% before aggregating; the date(0,0,0) sentinel is mapped back to `no` when a
% patient has no dated visit at all.
main(!IO) :-
    io.write_string("{Id, Lastname, SumScores, AvgScores, MaxDate}:\n", !IO),
    solutions((pred({Id,Lastname,Sum,Avg,MaxDate}::out) is nondet :-
	           patient(Id,Lastname),
	           Scores = (pred(Score::out) is nondet :- visit(Id,_,Score), \+is_nan(Score)),
	           bag_avg(Scores, Avg),
	  	   bag_sum(Scores, Sum),
	           Dates = (pred(Date::out) is nondet :- visit(Id,Date,_), Date=date(_,_,_)),
	           bag_max(Dates, date(0,0,0), MaxDate1),
		   (if MaxDate1 = date(0,0,0) then MaxDate = no else MaxDate = MaxDate1)),
	      Solutions),
    foldl(io.write_line, Solutions, !IO).
Output:
{Id, Lastname, SumScores, AvgScores, MaxDate}:
{1001, "Hopper", 17.4, 5.8, date(2020, 11, 19)}
{2002, "Gosling", 6.8, 6.8, date(2020, 10, 8)}
{3003, "Kemeny", 0.0, nan, date(2020, 11, 12)}
{4004, "Wirth", 15.4, 7.7, date(2020, 11, 5)}
{5005, "Kurtz", 0.0, nan, no}

Nim[edit]

CSV files and tables[edit]

import algorithm, parsecsv, strformat, strutils, tables

# Sentinel for a missing score (the CSV field was empty).
const NoValue = -1.0

type

  Names = OrderedTable[Positive, string]      # Mapping id -> last name.

  Visit = tuple[date: string; score: float]
  Visits = Table[Positive, seq[Visit]]        # Mapping id -> list of visits.


proc readNames(path: string): Names =
  ## Read the records (id, lastname) from the CSV file and fill a Names table.
  var parser: CsvParser
  parser.open(path)
  parser.readHeaderRow()
  while parser.readRow():
    let id = parser.row[0].parseInt
    let name = parser.row[1]
    result[id] = name

proc readVisits(path: string): Visits =
  ## Read the records (id, date, score) from the CSV file and fill a Visits table.
  ## Empty score fields are stored as NoValue so they can be skipped later.
  var parser: CsvParser
  parser.open(path)
  parser.readHeaderRow()
  while parser.readRow():
    let id = parser.row[0].parseInt
    let date = parser.row[1]
    let score = if parser.row[2].len == 0: NoValue else: parser.row[2].parseFloat
    result.mgetOrPut(id, @[]).add (date, score)


var
  names = readNames("patients1.csv")
  visits = readVisits("patients2.csv")

# Order the patients by id for the report.
names.sort(system.cmp)

echo "| PATIENT_ID |  LASTNAME  | LAST_VISIT |  SCORE_SUM | SCORE_AVG |"
for (id, name) in names.pairs:
  # Visits sorted as (date, score) tuples; ISO dates sort chronologically, so
  # the last element carries the most recent date (empty dates sort first).
  let visitList = visits.getOrDefault(id).sorted()
  let lastVisit = if visitList.len == 0: "" else: visitList[^1].date
  var sum = 0.0
  var count = 0
  for visit in visitList:
    if visit.score != NoValue:
      sum += visit.score
      inc count
  # Leave the score columns blank when the patient has no recorded scores.
  let scoreSum = if count == 0: "" else: &"{sum:>4.1f}"
  let scoreAvg = if count == 0: "" else: &"{sum / count.toFloat: >4.2f}"
  echo &"| {id:^10} | {name:^10} | {lastVisit:^10} | {scoreSum:>7}    | {scoreAvg:>6}    |"
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |   5.80    |
|    2002    |  Gosling   | 2020-10-08 |     6.8    |   6.80    |
|    3003    |   Kemeny   | 2020-11-12 |            |           |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |   7.70    |
|    5005    |   Kurtz    |            |            |           |

CSV files and SQLite3[edit]

Translation of: Python

We use the high level standard library "db_sqlite" rather than the low level one "sqlite3".

import parseCsv, db_sqlite, sequtils, strutils

const FNames = ["patients1.csv",  "patients2.csv"]

proc createTableHeaders(conn: DbConn) =
    ## Create the two empty tables the CSV data will be loaded into.
    conn.exec(sql"CREATE TABLE names(PATIENT_ID INT, LASTNAME TEXT);")
    conn.exec(sql"CREATE TABLE visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));")


proc fillTables(dbc: DbConn) =
  ## Load both CSV files: index 0 feeds "names", index 1 feeds "visits".
  ## NOTE(review): empty CSV fields are inserted as empty strings rather than
  ## NULL, so SQLite's SUM/AVG treat missing scores as 0 — which is why the
  ## output below shows SCORE_AVG 3.4 for Gosling and 0.0/0 for Kemeny instead
  ## of the task's expected values.  Confirm whether NULLs were intended.
  for idx, fname in FNames:
    dbc.exec(sql"BEGIN")
    var parser: CsvParser
    parser.open(fname)
    parser.readHeaderRow()
    while parser.readRow():
      if idx == 0:  # "names" table.
        dbc.exec(sql"INSERT INTO names VALUES (?, ?);", parser.row)
      else:         # "visits" table
        dbc.exec(sql"INSERT INTO visits VALUES (?, ?, ?);", parser.row)
    dbc.exec(sql"COMMIT")


proc joinTablesAndGroup(dbc: DbConn): seq[Row] =
  ## LEFT JOIN patients with their visits (keeping patients without visits),
  ## aggregate per patient, and return all rows of the materialised answer.
  dbc.exec(sql"""CREATE TABLE answer AS
                   SELECT
                     names.PATIENT_ID,
                     names.LASTNAME,
                     MAX(VISIT_DATE) AS LAST_VISIT,
                     SUM(SCORE) AS SCORE_SUM,
                     CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG
                   FROM
                     names
                     LEFT JOIN visits
                       ON visits.PATIENT_ID = names.PATIENT_ID
                   GROUP BY
                     names.PATIENT_ID,
                     names.LASTNAME
                   ORDER BY
                     names.PATIENT_ID;""")
  result = dbc.getAllRows(sql"SELECT * FROM ANSWER")

# Build the database and execute the request to get the result.
let dbc = open(":memory:", "", "", "")
dbc.createTableHeaders()
dbc.fillTables()
let result = dbc.joinTablesAndGroup()
dbc.close()

# Print the result.
echo "| PATIENT_ID |  LASTNAME  | LAST_VISIT |  SCORE_SUM | SCORE_AVG |"
for row in result:
  echo "| " & row.mapit(it.center(10)).join(" | ") & '|'
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.8    |
|    2002    |  Gosling   | 2020-10-08 |    6.8     |    3.4    |
|    3003    |   Kemeny   | 2020-11-12 |    0.0     |     0     |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.7    |
|    5005    |   Kurtz    |            |            |           |

Phix[edit]

constant patients_txt = split("""
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz""",'\n'),
         visits_txt = split("""
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3""",'\n')
--or, assuming these files contain exactly the same actual raw text,
--    and the use of GT_LF_STRIPPED is instead of above split('\n'):
--constant patients_txt = get_text("patients.csv",GT_LF_STRIPPED),
--         visits_txt   = get_text("visits.csv",GT_LF_STRIPPED)
 
-- sap pads each patient row with slots for LAST_VISIT/SCORE_SUM/SCORE_AVG
function sap(string s) return split(s,',',false)&{"",0,0} end function
function svp(string s) return split(s,',',false) end function
-- visits sorted by id ascending then date descending ({1,-2}), so the first
-- row seen for each id already carries that patient's latest visit date
sequence patient_data = sort(apply(patients_txt[2..$],sap)),
         visit_data = sort_columns(apply(visits_txt[2..$],svp),{1,-2})
visit_data = append(visit_data,{"","","0"}) -- (add a sentinel)
string last_id = "",id,name,dt,scstr,avstr
atom score,score_total,average
integer visit_count = 0, pdx = 1
-- single pass over the sorted visits: on each id change flush the previous
-- patient's totals into patient_data; the sentinel row forces the final flush
for i=1 to length(visit_data) do
    {id,dt,scstr} = visit_data[i]
    score = iff(scstr=""?0:scanf(scstr,"%f")[1][1])
    if id!=last_id then
        if visit_count then
            average = score_total/visit_count
            patient_data[pdx][4..5] = {score_total,average}
        end if
        if i=length(visit_data) then exit end if -- (sentinel)
        score_total = score
        visit_count = (score!=0)
        while id!=patient_data[pdx][1] do pdx += 1 end while
        patient_data[pdx][3] = dt
        last_id = id
    elsif score!=0 then
        score_total += score
        visit_count += 1
    end if
end for
printf(1,"| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |\n")
-- score of 0 means "never set" (patient had no scored visits): blank cells
for i=1 to length(patient_data) do
    {id,name,dt,score,average} = patient_data[i]
    scstr = iff(score=0?"":sprintf("%4.1f",score))
    avstr = iff(average=0?"":sprintf("%4.2f",average))
    printf(1,"| %-10s |  %-7s | %10s |  %-9s | %-9s |\n",
            {id,name,dt,scstr,avstr})
end for
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 6.80      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |

Perl[edit]

Weird requirement: "Use the most common libraries only when built-in functionality is not sufficient." Not even a "use strict;" :)

#!/usr/bin/perl
# Core Perl only: the two CSV "files" are embedded after __DATA__,
# separated by a single blank line.
my $fmt = '| %-11s' x 5 . "|\n";
printf $fmt, qw( PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG);
# Slurp DATA and split it at the blank line into the two datasets.
my ($names, $visits) = do { local $/; split /^\n/m, <DATA> };
my %score;
# Tally per patient: [0] = count of non-empty scores, [1] = their sum.
for ( $visits =~ /^\d.*/gm )
  {
  my ($id, undef, $score) = split /,/;
  $score{$id} //= ['', ''];
  $score and $score{$id}[0]++, $score{$id}[1] += $score;
  }
# One row per patient, sorted by id; the latest visit is the alphabetic
# maximum of that patient's ISO dates ('' is appended so patients without
# any dated visit print an empty cell).
for ( sort $names =~ /^\d.*/gm )
  {
  my ($id, $name) = split /,/;
  printf $fmt, $id, $name, ( sort $visits =~ /^$id,(.*?),/gm, '' )[-1],
    $score{$id}[0]
      ? ( $score{$id}[1], $score{$id}[1] / $score{$id}[0])
      : ('', '');
  }

__DATA__
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz

PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
Output:
| PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
| 1001       | Hopper     | 2020-11-19 | 17.4       | 5.8        |
| 2002       | Gosling    | 2020-10-08 | 6.8        | 6.8        |
| 3003       | Kemeny     | 2020-11-12 |            |            |
| 4004       | Wirth      | 2020-11-05 | 15.4       | 7.7        |
| 5005       | Kurtz      |            |            |            |

Prolog[edit]

Implemented using SWI Prolog:

% Facts: patient(Id, Lastname).
patient(1001,'Hopper').
patient(4004,'Wirth').
patient(3003,'Kemeny').
patient(2002,'Gosling').
patient(5005,'Kurtz').
 
% Facts: visit(Id, IsoDate, Score).  A missing date is the empty atom ''
% and a missing score is the atom nan.
visit(2002,'2020-09-10',6.8).
visit(1001,'2020-09-17',5.5).
visit(4004,'2020-09-24',8.4).
visit(2002,'2020-10-08',nan).
visit(1001,'',6.6).
visit(3003,'2020-11-12',nan).
visit(4004,'2020-11-05',7.0).
visit(1001,'2020-11-19',5.3).

% Latest visit per patient: parse the non-empty ISO dates to timestamps,
% take the maximum, and format it back.  Fails (no row) for patients with
% no dated visit at all, e.g. 5005.
summaryDates(Id, Lastname, LastDate) :- 
     aggregate(max(Ts),
	       Score^Date^(visit(Id, Date, Score), Date \= '', parse_time(Date, iso_8601, Ts)),
	       MaxTs),
     format_time(atom(LastDate), '%Y-%m-%d', MaxTs),
     patient(Id,Lastname).

% Sum and mean of the non-nan scores per patient.  Fails for patients with
% no scores at all (3003, 5005), which is why they are absent from the
% summaryScores output below.
summaryScores(Id, Lastname, Sum, Mean) :- 
     aggregate(r(sum(Score),count), Date^(visit(Id, Date, Score), Score \= nan), r(Sum,Count)), 
     patient(Id,Lastname),
     Mean is Sum/Count.

% Failure-driven loops that print every solution of each summary.
test :-
    summaryDates(Id, Lastname, LastDate),
    writeln(summaryDates(Id, Lastname, LastDate)),
    fail.

test :-
    summaryScores(Id, Lastname, ScoreSum, ScoreMean),
    writeln(summaryScores(Id, Lastname, ScoreSum, ScoreMean)),
    fail.
Output:
summaryDates(1001,Hopper,2020-11-19)
summaryDates(2002,Gosling,2020-10-08)
summaryDates(3003,Kemeny,2020-11-12)
summaryDates(4004,Wirth,2020-11-05)
summaryScores(1001,Hopper,17.4,5.8)
summaryScores(2002,Gosling,6.8,6.8)
summaryScores(4004,Wirth,15.4,7.7)
false.

Implemented using XSB Prolog (which allows for user-defined aggregates):

:- import bagMax/2, bagCount/2, bagSum/2, bagReduce/4 from aggregs.
:- import julian_date/7, date_string/3 from iso8601.
:- import load_csv/2, add_cvt_type_hook/2 from proc_files.

% Register a converter so CSV date columns load as date(Y,M,D) terms.
?- add_cvt_type_hook(date,date_converter(_,_)).

date_converter(Atom,Date) :- date_string('YYYY-MM-DD',Date,Atom).

:- load_csv('visit.csv',visit(integer,date,float)).
:- load_csv('patient.csv',patient(integer,atom)).

% NaN is the only float that is not arithmetically equal to itself.
is_nan(Number) :- X is Number, X =\= Number.

% Latest visit per patient: maximum julian day number of its dated visits,
% converted back to an ISO date atom.
summaryDates(Id, Lastname, LastDate) :-
    bagMax(date_number(Id), LastDateNumber),
    patient(Id,Lastname),
    julian_date(LastDateNumber, Y, M, D, _, _, _),
    date_converter(LastDate, date(Y,M,D)).

% Sum and mean of a patient's non-NaN scores.
summaryScores(Id, Lastname, Sum, Mean) :- 
    bagSum(scores(Id), Sum),
    bagCount(scores(Id), Count),
    Mean is Sum/Count,
    patient(Id,Lastname).

test :-
    summaryDates(Id,Lastname,LastDate),
    writeln(summaryDates(Id,Lastname,LastDate)), fail.

test :-
    summaryScores(Id, Lastname, ScoreSum, ScoreMean),
    writeln(summaryScores(Id, Lastname, ScoreSum, ScoreMean)), fail.

/* Put hilog declarations together */

% HiLog bags: date_number(Id) enumerates the julian day numbers of Id's
% dated visits; scores(Id) enumerates Id's non-NaN scores.
date_number(Id)(Number) :-
    visit(Id, date(Y,M,D), _), 
    julian_date(Number, Y, M, D, _, _, _).
		   
scores(Id)(Score) :-
    visit(Id, _, Score),
    \+is_nan(Score).

:- hilog maximum.
maximum(X,Y,Z) :- X @> Y -> Z=X ; Z=Y.
:- hilog sum.
sum(X,Y,Z) :- Z is X+Y.
:- hilog successor.
successor(X,_Y,Z) :- Z is X+1.

PureBasic[edit]

; One Merge record per patient id: the patient's name plus his visit list.
Structure Person
  Name$
EndStructure

Structure Visits
  Datum$
  Score$
EndStructure

Structure Merge
  Patient.Person
  List PVisit.Visits()
EndStructure

NewMap P.Merge()
NewList ID$()

; Load patients.csv: fill the map P() and remember each id in ID$().
If ReadFile(1,"./Data/patients.csv")=0 : End 1 : EndIf
header=1
While Not Eof(1)
  buf1$=ReadString(1)
  If header=1 : header=0 : Continue : EndIf
  bufId$=StringField(buf1$,1,",")
  P(bufId$)\Patient\Name$=StringField(buf1$,2,",")
  AddElement(ID$()) : ID$()=bufId$
Wend
CloseFile(1)

; Load visits.csv: append each visit (date + score, kept as strings) to its patient.
If ReadFile(2,"./Data/visits.csv")=0 : End 2 : EndIf
header=1
While Not Eof(2)
  buf1$=ReadString(2)
  If header=1 : header=0 : Continue : EndIf 
  bufId$=StringField(buf1$,1,",")
  AddElement(P(bufId$)\PVisit()) 
  P(bufId$)\PVisit()\Datum$=StringField(buf1$,2,",")
  P(bufId$)\PVisit()\Score$=StringField(buf1$,3,",")
Wend
CloseFile(2)

If OpenConsole()=0 : End 3 : EndIf
SortList(ID$(),#PB_Sort_Ascending)
PrintN("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
ForEach ID$()
  Print("| "+LSet(ID$(),11))
  Print("| "+LSet(P(ID$())\Patient\Name$,9)+"|")
  ; Sort visits by ISO date string so the last list element is the latest visit.
  SortStructuredList(P(ID$())\PVisit(),#PB_Sort_Ascending,OffsetOf(Visits\Datum$),TypeOf(Visits\Datum$))  
  ; Sum the scores; c counts only visits whose score parses to non-zero.
  ForEach P(ID$())\PVisit() 
    scs.f+ValF(p(ID$())\PVisit()\Score$) : c+Bool(ValF(p(ID$())\PVisit()\Score$))
  Next
  If LastElement(P(ID$())\PVisit())    
    sca.f=scs/c 
    Print(" "+LSet(P(ID$())\PVisit()\Datum$,10)+" |")
    Print(RSet(StrF(scs,1),10)+" |")
    ; c = 0 makes sca NaN (visits but no scores, e.g. Kemeny): blank average cell.
    If Not IsNAN(sca) : Print(RSet(StrF(sca,2),10)+" |") : Else : Print(Space(11)+"|") : EndIf
  Else
    Print(Space(12)+"|"+Space(11)+"|"+Space(11)+"|")
  EndIf  
  PrintN("") : scs=0 : c=0
Next
Input()
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 |      17.4 |      5.80 |
| 2002       | Gosling  | 2020-10-08 |       6.8 |      6.80 |
| 3003       | Kemeny   | 2020-11-12 |       0.0 |           |
| 4004       | Wirth    | 2020-11-05 |      15.4 |      7.70 |
| 5005       | Kurtz    |            |           |           |

Python[edit]

Python: Using pandas library[edit]

# to install pandas library go to cmd prompt and type:
# cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\
# pip install pandas
# to install pandas library go to cmd prompt and type:
# cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\
# pip install pandas
import pandas as pd

# load data from csv files
df_patients = pd.read_csv (r'patients.csv', sep = ",", decimal=".")
df_visits = pd.read_csv (r'visits.csv', sep = ",", decimal=".")

''' # load data hard coded, create data frames
import io
str_patients = """PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
"""
df_patients = pd.read_csv(io.StringIO(str_patients), sep = ",", decimal=".")
str_visits = """PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
"""
df_visits = pd.read_csv(io.StringIO(str_visits), sep = ",", decimal=".")
'''

# typecast from string to datetime so .agg can 'max' it
df_visits['VISIT_DATE'] = pd.to_datetime(df_visits['VISIT_DATE'])

# merge on PATIENT_ID; how='left' keeps patients without visits (e.g. 5005)
df_merge = df_patients.merge(df_visits, on='PATIENT_ID', how='left')

# groupby is an intermediate object
df_group = df_merge.groupby(['PATIENT_ID','LASTNAME'], as_index=False)

# note: you can use 'sum' instead of the lambda function but that returns NaN as 0 (zero)
# sum(min_count=1) keeps NaN for all-NaN groups; 'mean' skips NaN by default.
# NOTE(review): the aggregated column keeps its source name (VISIT_DATE); the
# sample output's LAST_VISIT header suggests a rename step not shown here - confirm.
df_result = df_group.agg({'VISIT_DATE': 'max', 'SCORE': [lambda x: x.sum(min_count=1),'mean']})

print(df_result)
  PATIENT_ID LASTNAME LAST_VISIT      SCORE
                             max <lambda_0> mean
0       1001   Hopper 2020-11-19       17.4  5.8
1       2002  Gosling 2020-10-08        6.8  6.8
2       3003   Kemeny 2020-11-12        NaN  NaN
3       4004    Wirth 2020-11-05       15.4  7.7
4       5005    Kurtz        NaT        NaN  NaN

Python: Stdlib csv only[edit]

Using only standard libraries and input from csv files.

import csv

fnames = 'patients.csv  patients_visits.csv'.split()  # note: the visits file is named patients_visits.csv here, not visits.csv as in the task

def csv2list(fname):
    """Return every row of the CSV file `fname`, header included,
    as a list of lists of strings."""
    with open(fname) as f:
        return [row for row in csv.reader(f)]

patients, visits = data = [csv2list(fname) for fname in fnames]
result = [record.copy() for record in patients]
result[1:] = sorted(result[1:])
#%%
result[0].append('LAST_VISIT')
# Keep the maximum (latest) visit date per patient.  Dates are ISO-8601, so
# plain string comparison orders them chronologically; empty dates are skipped.
# A dict comprehension keeping the last row seen would only be correct when the
# visits file happens to be sorted by date within each patient.
last = {}
for p, vis, *_ in visits[1:]:
    if vis and vis > last.get(p, ''):
        last[p] = vis
for record in result[1:]:
    p = record[0]
    record.append(last.get(p, ''))
#%%
# Score totals: n counts and tot sums the non-empty scores per patient;
# a patient whose count stays 0 gets blank score cells below.
result[0] += ['SCORE_SUM', 'SCORE_AVG']
n = {p: 0 for p, *_ in patients[1:]}
tot = n.copy()
for record in visits[1:]:
    p, _, score = record
    if score:
        n[p] += 1
        tot[p] += float(score)
for record in result[1:]:
    p = record[0]
    if n[p]:
        record += [f"{tot[p]:5.1f}", f"{tot[p] / n[p]:5.2f}"]
    else:
        record += ['', '']
#%%
# Render as a crude ASCII table, each cell centred in 10 characters.
for record in result:
    print(f"| {' | '.join(f'{r:^10}' for r in record)} |")
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.80    |
|    2002    |  Gosling   | 2020-10-08 |     6.8    |    6.80    |
|    3003    |   Kemeny   | 2020-11-12 |            |            |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.70    |
|    5005    |   Kurtz    |            |            |            |


Python: Stdlib sqlite3 and csv only[edit]

Using the csv module just to parse inputs; and the sqlite3 module (which is also a standard library that comes with the base Python install) to calculate the output.
(The SQL SELECT statement is modelled on that of the SQL entry elsewhere on this page).

import sqlite3
import csv


fnames = 'patients.csv  patients_visits.csv'.split()  # table names are derived from these file names
conn = sqlite3.connect(":memory:")  # throwaway in-memory database
#%%
    
def create_table_headers(conn):
    """Create the two empty tables that the CSV data will be loaded into."""
    ddl_statements = (
        '''
      CREATE TABLE patients(PATIENT_ID INT, LASTNAME TEXT);
    ''',
        '''
      CREATE TABLE patients_visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));
    ''',
    )
    cursor = conn.cursor()
    for ddl in ddl_statements:
        cursor.execute(ddl)
    conn.commit()

def fill_tables(conn, fnames):
    """Load each CSV file into the table of the same base name.

    Empty CSV fields are inserted as SQL NULL so that the aggregate
    functions (MAX/SUM/AVG) ignore them later on.
    """
    cursor = conn.cursor()
    for fname in fnames:
        tablename = fname.replace('.csv', '')
        with open(fname) as f:
            reader = csv.reader(f)
            header = next(reader)
            placeholders = ','.join('?' for _ in header)
            statement = f"INSERT INTO {tablename} VALUES ({placeholders});"
            for raw_row in reader:
                values = [field if field != '' else None for field in raw_row]
                cursor.execute(statement, values)
    conn.commit()

def join_tables_and_group(conn):
    """Left-join patients with their visits, aggregate per patient, and
    return [header_tuple, row_tuple, ...].

    The aggregate is materialised in a table named `answer` (so calling this
    twice on the same connection fails) and then read back in full.
    SQL aggregates ignore NULLs, so missing dates/scores do not contribute.
    """
    cur = conn.cursor()
    cur.execute('''
CREATE TABLE answer AS
    SELECT
    	patients.PATIENT_ID,
    	patients.LASTNAME,
    	MAX(VISIT_DATE) AS LAST_VISIT,
    	SUM(SCORE) AS SCORE_SUM,
    	CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG
    FROM
    	patients
    	LEFT JOIN patients_visits
    		ON patients_visits.PATIENT_ID = patients.PATIENT_ID
    GROUP BY
    	patients.PATIENT_ID,
    	patients.LASTNAME
    ORDER BY
    	patients.PATIENT_ID;
        ''')
    cur.execute('''
SELECT * FROM answer;
        ''')
    conn.commit()
    headers = tuple(column[0] for column in cur.description)
    return [headers] + cur.fetchall()
                 
# Build the schema, load the CSV files, aggregate, and print the result
# (header row first) as a crude ASCII table with centred cells.
create_table_headers(conn)
fill_tables(conn, fnames)
result = join_tables_and_group(conn)
for record in result:
    print(f"| {' | '.join(f'{str(r):^10}' for r in record)} |")
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.8     |
|    2002    |  Gosling   | 2020-10-08 |    6.8     |    6.8     |
|    3003    |   Kemeny   | 2020-11-12 |    None    |    None    |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.7     |
|    5005    |   Kurtz    |    None    |    None    |    None    |

R[edit]

# load data from csv files
# setwd("C:\Temp\csv\")
# df_patient <- read.csv(file="patients.csv", header = TRUE, sep = ",")
# df_visits <- read.csv(file="visits.csv", header = TRUE, sep = ",", dec = ".", colClasses=c("character","character","numeric"))

# load data hard coded, create data frames
df_patient <- read.table(text = "
PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz
", header = TRUE, sep = ",") #  character fields so no need for extra parameters colClasses etc.

df_visits <- read.table(text = "
PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3
", header = TRUE, dec = ".", sep = ",", colClasses=c("character","character","numeric"))

# aggregate visit date and scores
# tapply groups each column by PATIENT_ID; max over the ISO date strings is
# the latest visit.  With na.rm=TRUE, sum() of an all-NA group is 0 and
# mean() is NaN — see Kemeny in the output.
# NOTE(review): cbind() produces a character matrix, so every column of
# df_agg ends up as character rather than numeric — confirm this is intended.
df_agg <- data.frame(
  cbind(
    PATIENT_ID = names(tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE)),
    last_visit = tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE),
    score_sum = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), sum, na.rm=TRUE),
    score_avg = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), mean, na.rm=TRUE)
  )
) 

# merge patients and aggregate dataset
# all.x = all the non matching cases of df_patient are appended to the result as well (i.e. 'left join')
df_result <- merge(df_patient, df_agg, by = 'PATIENT_ID', all.x = TRUE)

print(df_result)
Output:
  PATIENT_ID LASTNAME last_visit score_sum score_avg
1       1001   Hopper 2020-11-19      17.4       5.8
2       2002  Gosling 2020-10-08       6.8       6.8
3       3003   Kemeny 2020-11-12         0       NaN
4       4004    Wirth 2020-11-05      15.4       7.7
5       5005    Kurtz       <NA>      <NA>      <NA>

Raku[edit]

# Hard-coded datasets: each row becomes a hash keyed by the column names.
my @names = map { ( <PATIENT_ID LASTNAME> Z=> .list ).hash },
    ( 1001, 'Hopper'  ),
    ( 4004, 'Wirth'   ),
    ( 3003, 'Kemeny'  ),
    ( 2002, 'Gosling' ),
    ( 5005, 'Kurtz'   ),
;
my @visits = map { ( <PATIENT_ID VISIT_DATE SCORE> Z=> .list ).hash },
    ( 2002, '2020-09-10', 6.8 ),
    ( 1001, '2020-09-17', 5.5 ),
    ( 4004, '2020-09-24', 8.4 ),
    ( 2002, '2020-10-08', Nil ),
    ( 1001,         Nil , 6.6 ),
    ( 3003, '2020-11-12', Nil ),
    ( 4004, '2020-11-05', 7.0 ),
    ( 1001, '2020-11-19', 5.3 ),
;

# Bucket the visit rows per patient id.
my %v = @visits.classify: *.<PATIENT_ID>;

# One summary hash per patient: undefined dates/scores are dropped first, and
# the aggregate fields stay undefined when nothing is left to aggregate.
my @result = gather for @names -> %n {
    my @p = %v{ %n.<PATIENT_ID> }<>;

    my @dates  = @p».<VISIT_DATE>.grep: *.defined;
    my @scores = @p».<     SCORE>.grep: *.defined;

    take {
        %n,
        LAST_VISIT => ( @dates.max          if @dates  ),
        SCORE_AVG  => ( @scores.sum/@scores if @scores ),
        SCORE_SUM  => ( @scores.sum         if @scores ),
    };
}

# Print rows ordered by id, rendering undefined cells as empty strings.
my @out_field_names = <PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG>;
my @rows = @result.sort(*.<PATIENT_ID>).map(*.{@out_field_names});
say .map({$_ // ''}).fmt('%-10s', ' | ') for @out_field_names, |@rows;
Output:
PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG 
1001       | Hopper     | 2020-11-19 | 17.4       | 5.8       
2002       | Gosling    | 2020-10-08 | 6.8        | 6.8       
3003       | Kemeny     | 2020-11-12 |            |           
4004       | Wirth      | 2020-11-05 | 15.4       | 7.7       
5005       | Kurtz      |            |            |           

REXX[edit]

/* REXX */
/* Read patients.csv: lastname.id holds each name, idl collects the ids */
patients='patients.csv'
l=linein(patients)
Parse Var l h1 ',' h2
n=0
idl=''
Do n=1 By 1 While lines(patients)>0
  l=linein(patients)
  Parse Var l id ',' lastname.id
  idl=idl id
  End
n=n-1 /* number of patients */
visits='visits.csv'
l=linein(visits) /* skip the header line of this file */
h3='LAST_VISIT'
h4='SCORE_SUM'
h5='SCORE_AVG'
date.=''
score.=0
Say '|' h1 '|' h2 '|' h3 '|' h4 '|' h5 '|'
/* One pass over visits: keep the maximum (ISO, so string-comparable) date
   per id and collect the non-empty scores in the stem score.id.*         */
Do While lines(visits)>0
  l=linein(visits)
  Parse Var l id ',' date ',' score
  if date>date.id Then date.id=date
  If score>'' Then Do
    z=score.id.0+1
    score.id.z=score
    score.id.0=z
    End
  end
/* Emit one line per patient in ascending id order; patients without any
   scores get blank SUM/AVG cells                                         */
idl=wordsort(idl)
Do While idl<>''
  Parse Var idl id idl
  If date.id='' Then date.id=copies(' ',10)
  ol='|' left(id,length(h1)) '|' left(lastname.id,length(h2)),
                             '|' left(date.id,length(h3))
  If score.id.0=0 Then Do
  ol=ol '|' left(' ',length(h4)) '|',
            left(' ',length(h5)) '|'
    score_sum=copies(' ',length(h4))
    score_avg=copies(' ',length(h4))
    End
  Else Do
    score_sum=0
    Do j=1 To score.id.0
      score_sum=score_sum+score.id.j
      End
    score_avg=score_sum/score.id.0
    ol=ol '|' left(format(score_sum,2,1),length(h4)) '|',
             left(format(score_avg,2,1),length(h5)) '|'
    End
  Say ol
  End
Exit

wordsort: Procedure
/**********************************************************************
* Sort the list of words supplied as argument. Return the sorted list
* (a simple insertion sort using the stem wa.)
**********************************************************************/
  Parse Arg wl
  wa.=''
  wa.0=0
  Do While wl<>''
    Parse Var wl w wl
    /* find the insertion point for w */
    Do i=1 To wa.0
      If wa.i>w Then Leave
      End
    /* shift the tail one slot right to make room */
    If i<=wa.0 Then Do
      Do j=wa.0 To i By -1
        ii=j+1
        wa.ii=wa.j
        End
      End
    wa.i=w
    wa.0=wa.0+1
    End
  /* reassemble the sorted words into one blank-separated list */
  swl=''
  Do i=1 To wa.0
    swl=swl wa.i
    End
  Return strip(swl)
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      |  5.8      |
| 2002       | Gosling  | 2020-10-08 |  6.8      |  6.8      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      |  7.7      |
| 5005       | Kurtz    |            |           |           |

SAS[edit]

  %let datefmt=E8601DA10.;
  /* Read the two CSV files; dates use the ISO-8601 informat/format. */
  data patient;
      infile "patient.csv" dsd dlm=',';
      attrib
          id length=4
          lastname length=$10; 
      input id lastname;
  data visit;
      infile "visit.csv" dsd dlm=',';
      attrib
          id length=4
          date informat=&datefmt format=&datefmt
          score length=8; 
      input id date score;
  /* Aggregate the visits per id, then right-join onto patient so patients
     without visits (5005) are kept; MAX/SUM/AVG ignore missing values. */
  proc sql;
      select * from 
          (select id, max(date) format=&datefmt as max_date, sum(score) as sum_score,
          	avg(score) as avg_score from visit group by id)
          natural right join patient
          order by id;
Output:
:       id  lastname      max_date  sum_score  avg_score
: ------------------------------------------------------
:     1001  Hopper      2020-11-19       17.4        5.8
:     2002  Gosling     2020-10-08        6.8        6.8
:     3003  Kemeny      2020-11-12          .          .
:     4004  Wirth       2020-11-05       15.4        7.7
:     5005  Kurtz                .          .          .

SPSS[edit]

Note: the PATIENT_ID 5005 is missing from the result, because MATCH FILES cannot add rows for missing keys, i.e. AFAIK a "left join" is not possible in SPSS.

* set working directory to location of .csv files

CD 'C:\Temp\csv\'.

* load patients csv data

GET DATA /TYPE=TXT
  /FILE="patients.csv"
  /ENCODING='UTF8'
  /DELCASE=LINE
  /DELIMITERS=","
  /QUALIFIER='"'
  /ARRANGEMENT=DELIMITED
  /FIRSTCASE=2
  /IMPORTCASE=ALL
  /VARIABLES=
PATIENT_ID F5.0
LASTNAME A20
.
CACHE.
EXECUTE.

* sort cases is needed to match files

SORT CASES BY PATIENT_ID (A).
DATASET NAME Patients WINDOW=FRONT.

* load visits csv data

GET DATA /TYPE=TXT
  /FILE="visit.csv"
  /ENCODING='UTF8'
  /DELCASE=LINE
  /DELIMITERS=","
  /QUALIFIER='"'
  /ARRANGEMENT=DELIMITED
  /FIRSTCASE=2
  /IMPORTCASE=ALL
  /VARIABLES=
PATIENT_ID F5.0
VISIT_DATE SDATE10
SCORE F4.1
.
CACHE.
EXECUTE.

* sort cases is needed, else match files will raise error "Files out of order"

SORT CASES BY PATIENT_ID (A) VISIT_DATE (A).
DATASET NAME Visits WINDOW=FRONT.

* merge datasets, one to many, FILE is the 'one', TABLE is 'many'

MATCH FILES TABLE = Patients / FILE = Visits
/BY PATIENT_ID.
EXECUTE.

* aggregate visit date and scores, group by and order (A)=ascending or (D)=descending 

* NOTE(review): /BREAK lists only PATIENT_ID, so LASTNAME is not carried into
* the aggregated file as written; to match the shown output LASTNAME should
* presumably be added to /BREAK - confirm

AGGREGATE OUTFILE *
/BREAK=PATIENT_ID(A)
/last_visit = MAX(VISIT_DATE)
/score_avg = MEAN(SCORE)
/score_sum = SUM(SCORE).
Output:
| PATIENT_ID | LASTNAME | last_visit | score_sum | score_avg |
| 1001       |  Hopper  | 2020-11-19 |      17.4 |       5.8 |
| 2002       |  Gosling | 2020-10-08 |       6.8 |       6.8 |
| 3003       |  Kemeny  | 2020-11-12 |         . |         . |
| 4004       |  Wirth   | 2020-11-05 |      15.4 |       7.7 |

SQL[edit]

-- drop tables
DROP TABLE IF EXISTS tmp_patients;
DROP TABLE IF EXISTS tmp_visits;

-- create tables
CREATE TABLE tmp_patients(
	PATIENT_ID INT,
	LASTNAME VARCHAR(20)
);

CREATE TABLE tmp_visits(
	PATIENT_ID INT,
	VISIT_DATE DATE,
	SCORE NUMERIC(4,1)
);

-- load data from csv files
/*
-- Note: LOAD DATA LOCAL requires `local-infile` enabled on both the client and server else you get error "#1148 command is not allowed.."
LOAD DATA LOCAL INFILE '/home/csv/patients.csv' INTO TABLE `tmp_patients` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
LOAD DATA LOCAL INFILE '/home/csv/visits.csv' INTO TABLE `tmp_visits` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
*/

-- load data hard coded; missing dates/scores are NULL
INSERT INTO tmp_patients(PATIENT_ID, LASTNAME)
VALUES
(1001, 'Hopper'),
(4004, 'Wirth'),
(3003, 'Kemeny'),
(2002, 'Gosling'),
(5005, 'Kurtz');

INSERT INTO tmp_visits(PATIENT_ID, VISIT_DATE, SCORE)
VALUES
(2002, '2020-09-10', 6.8),
(1001, '2020-09-17', 5.5),
(4004, '2020-09-24', 8.4),
(2002, '2020-10-08', NULL),
(1001, NULL, 6.6),
(3003, '2020-11-12', NULL),
(4004, '2020-11-05', 7.0),
(1001, '2020-11-19', 5.3);

-- join tables and group
-- LEFT JOIN keeps patients without visits (5005); MAX/SUM/AVG ignore NULLs,
-- and yield NULL for groups with no non-NULL values at all
SELECT
	p.PATIENT_ID,
	p.LASTNAME,
	MAX(VISIT_DATE) AS LAST_VISIT,
	SUM(SCORE) AS SCORE_SUM,
	CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG
FROM
	tmp_patients p
	LEFT JOIN tmp_visits v
		ON v.PATIENT_ID = p.PATIENT_ID
GROUP BY
	p.PATIENT_ID,
	p.LASTNAME
ORDER BY
	p.PATIENT_ID;
Output:
PATIENT_ID  LASTNAME  LAST_VISIT  SCORE_SUM  SCORE_AVG
1001        Hopper    2020-11-19       17.4       5.80
2002        Gosling   2020-10-08        6.8       6.80
3003        Kemeny    2020-11-12       NULL       NULL
4004        Wirth     2020-11-05       15.4       7.70
5005        Kurtz     NULL             NULL       NULL

Transd[edit]

#lang transd

// Merge two hard-coded CSV datasets (patients, visits) with a TSDBase
// table and aggregate per patient: last visit date, score sum, average.
// NOTE(review): per the output shown below, a missing score is treated
// as 0 and still counted in NUM_VISITS, so SCORE/SCORE_AVG differ from
// the task's expected result (e.g. 2002 averages 3.4, 3003 shows 0).
MainModule: {

// patients.csv rows: PATIENT_ID,LASTNAME
tbl: `1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz`,

// visits.csv rows: PATIENT_ID,VISIT_DATE,SCORE (either field may be empty)
tbl1: `
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3`,

// column schema for the aggregated table; PATIENT_ID is the key column,
// the extra columns hold the running aggregates
cols: `@key_PATIENT_ID:Int,
        LASTNAME:String,
        VISIT_DATE:DateTime,
        SCORE:Double,
        SCORE_AVG:Double,
        NUM_VISITS:Int`,

// shape of one parsed visit row: (id, date, score)
Record : typealias(Tuple<Int DateTime Double>()),

	_start: (λ (with base TSDBase()
		// load the patients into the database table and index by id
		(load-table base tbl colNames: cols)
        (build-index base "PATIENT_ID")

        // parse the visit rows, then fold each one into the matching
        // patient row, updating the running aggregates in place
        (with vizs Vector<Record>() 
            (load-table vizs tbl1 :mixedTypes fieldSep: "," rowSep: "\n" )
            (for viz in vizs do
                (tsd-query base 
                    :update set:
                (lambda PATIENT_ID Int() VISIT_DATE DateTime() 
                        SCORE Double() SCORE_AVG Double() NUM_VISITS Int()
                    (+= NUM_VISITS 1)
                    // last assignment wins; presumably rows arrive in a
                    // fixed order so the final date stored is the maximum
                    (set VISIT_DATE (get viz 1)) 
                    (set SCORE (+ SCORE (get viz 2)))
                    (set SCORE_AVG (/ SCORE NUM_VISITS)))
            where: (lambda PATIENT_ID Int() (eq PATIENT_ID (get viz 0))))
            ))
        
        // select the merged columns for every patient and print them
        // as a fixed-width table
        (with cols ["PATIENT_ID","LASTNAME","VISIT_DATE","SCORE","SCORE_AVG"]
        (with recs (tsd-query base select: cols 
                as: [[Int(), String(), DateTime(), Double(), Double()]]
                where: (lambda PATIENT_ID Int() true)
                )
                (for i in cols do (textout width: 10 i "|")) (lout "")
                (for rec in recs do 
                    (for-each rec (λ i :Data() (textout width: 10 i "|" )))
                    (lout ""))
		))
	))
}
Output:
PATIENT_ID|  LASTNAME|VISIT_DATE|     SCORE| SCORE_AVG|
      1001|    Hopper|2020-11-19|      17.4|       5.8|
      4004|     Wirth| 2020-11-5|      15.4|       7.7|
      3003|    Kemeny|2020-11-12|         0|         0|
      2002|   Gosling| 2020-10-8|       6.8|       3.4|
      5005|     Kurtz|          |         0|         0|

TutorialD[edit]

// Model a possibly-missing visit date as a union type: a valid Y/M/D
// date, an explicitly absent date, or an unknown date (patient never
// visited). This avoids NULLs, which the relational model disallows.
BEGIN; 
  TYPE Date UNION; 
  TYPE DateValid IS {Date POSSREP {year INTEGER, month INTEGER, day INTEGER}};
  TYPE DateNone IS {Date POSSREP {}};
  TYPE DateUnknown IS {Date POSSREP {}};
END;

// patients.csv as a relation variable, keyed by patient id
VAR patient REAL RELATION {id INT, lastname CHAR} KEY {id};

INSERT patient RELATION
{TUPLE {id 1001, lastname 'Hopper'},
 TUPLE {id 4004, lastname 'Wirth'},
 TUPLE {id 3003, lastname 'Kemeny'},
 TUPLE {id 2002, lastname 'Gosling'},
 TUPLE {id 5005, lastname 'Kurtz'}
 };

// visits.csv; a missing score is represented as NAN, a missing date as
// DateNone(), and {id, date} jointly identify a visit
VAR visit REAL RELATION {id INT, date Date, score RATIONAL} KEY {id, date};

INSERT visit RELATION
{
TUPLE {id 2002, date DateValid(2020,09,10), score 6.8},
TUPLE {id 1001, date DateValid(2020,09,17), score 5.5},
TUPLE {id 4004, date DateValid(2020,09,24), score 8.4},
TUPLE {id 2002, date DateValid(2020,10,08), score NAN},
TUPLE {id 1001, date DateNone(), score 6.6},
TUPLE {id 3003, date DateValid(2020,11,12), score NAN},
TUPLE {id 4004, date DateValid(2020,11,05), score 7.0},
TUPLE {id 1001, date DateValid(2020,11,19), score 5.3}
};

// Aggregate: sum/avg over scored visits only (score>0.0 filters out the
// NAN placeholders), padded with NaN rows for patients lacking scores;
// joined with max visit date, padded with DateUnknown() for patients
// with no visits at all; finally joined back to patient for lastname.
((SUMMARIZE (visit WHERE score>0.0) BY {id}: {sumscore := SUM(score), avgscore := AVG(score)}) UNION
 (EXTEND (patient {id} MINUS ((visit WHERE score>0.0) {id})): {sumscore := NaN, avgscore := NaN})) JOIN
    (SUMMARIZE visit BY {id}: {maxdate := MAX(date)} UNION
     (EXTEND (patient {id} MINUS (visit {id})): {maxdate := DateUnknown()})) JOIN
    patient
Output:
id	sumscore	avgscore	maxdate			lastname
1001	17.4		5.8		DateValid(2020, 11, 19)	Hopper
2002	6.8		6.8		DateValid(2020, 10, 8)	Gosling
4004	15.4		7.7		DateValid(2020, 11, 5)	Wirth
3003	NaN		NaN		DateValid(2020, 11, 12)	Kemeny
5005	NaN		NaN		DateUnknown()		Kurtz

Wren[edit]

Library: Wren-sort
Library: Wren-fmt
import "/fmt" for Fmt
import "/sort" for Sort

// One patient record. The class additionally maintains two static
// registries over every patient constructed so far: an id -> lastname
// map and a list of ids kept in ascending order.
class Patient {
    construct new(id, lastName) {
        _id = id
        _lastName = lastName
        // lazily create the registries on the first construction
        if (__dir == null) __dir = {}
        if (__ids == null) __ids = []
        __dir[id] = lastName
        __ids.add(id)
        Sort.insertion(__ids)  // keep the id list sorted after each insert
    }

    id       { _id }
    lastName { _lastName }

    // maps an id to a lastname
    static dir { __dir }

    // sorted list of all patient ids
    static ids { __ids }
}

// One visit record. The class maintains a static map from patient id to
// a pair of parallel lists [dates, scores]. A missing date is stored as
// the sentinel "0000-00-00", which sorts before every real ISO date.
class Visit {
    construct new(id, date, score) {
        _id = id
        _date = date || "0000-00-00"
        _score = score
        if (__dir == null) __dir = {}
        var entry = __dir[id]
        if (entry == null) {
            __dir[id] = [ [_date], [score] ]
        } else {
            entry[0].add(_date)
            entry[1].add(score)
        }
    }

    id    { _id }
    date  { _date }
    score { _score }

    // maps an id to [list of dates, list of scores]
    static dir { __dir }
}

// A merged per-patient view over the static Patient and Visit registries,
// plus a formatter that prints the aggregated result table.
class Merge {
    construct new(id) {
        _id = id
    }

    id        { _id }
    lastName  { Patient.dir[_id] }
    dates     { Visit.dir[_id][0] }
    scores    { Visit.dir[_id][1] }

    // the maximum date sorts last; the "0000-00-00" sentinel never wins
    // against a real visit date
    lastVisit { Sort.merge(dates)[-1] }

    // null scores are skipped; yields 0 when no score is present
    scoreSum  { scores.reduce(0) { |acc, s| s ? acc + s : acc } }

    // divides by the count of non-null scores; NaN when there are none
    scoreAvg  { scoreSum / scores.count { |s| s } }

    static print(merges) {
        System.print("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
        var rowFmt = "| $d       | $-7s  | $s | $4s      | $4s      |"
        for (m in merges) {
            if (!Visit.dir[m.id]) {
                // patient with no visits at all: all value columns blank
                Fmt.print(rowFmt, m.id, m.lastName, "          ", "    ", "    ")
            } else {
                var visitCol = (m.lastVisit == "0000-00-00") ? "          " : m.lastVisit
                var sumCol   = (m.scoreSum > 0) ? Fmt.f(4, m.scoreSum, 1) : "    "
                var avgCol   = m.scoreAvg.isNan ? "    " : Fmt.f(4, m.scoreAvg, 2)
                Fmt.print(rowFmt, m.id, m.lastName, visitCol, sumCol, avgCol)
            }
        }
    }
}

// Load both datasets (hard-coded rather than parsed from the csv files).
var patientData = [
    [1001, "Hopper"],
    [4004, "Wirth"],
    [3003, "Kemeny"],
    [2002, "Gosling"],
    [5005, "Kurtz"]
]
for (p in patientData) Patient.new(p[0], p[1])

var visitData = [
    [2002, "2020-09-10", 6.8],
    [1001, "2020-09-17", 5.5],
    [4004, "2020-09-24", 8.4],
    [2002, "2020-10-08", null],
    [1001, null,         6.6],
    [3003, "2020-11-12", null],
    [4004, "2020-11-05", 7.0],
    [1001, "2020-11-19", 5.3]
]
for (v in visitData) Visit.new(v[0], v[1], v[2])

// Merge per patient id (already sorted) and display the result table.
Merge.print(Patient.ids.map { |id| Merge.new(id) }.toList)
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |