Merge and aggregate datasets
Merge and aggregate datasets
You are encouraged to solve this task according to the task description, using any language you may know.
- Task
Merge and aggregate two datasets as provided in .csv files into a new resulting dataset.
Use the appropriate methods and data structures depending on the programming language.
Use the most common libraries only when built-in functionality is not sufficient.
- Note
Either load the data from the .csv files or create the required data structures hard-coded.
patients.csv file contents:
PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz
visits.csv file contents:
PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3
Create a resulting dataset in-memory or output it to screen or file, whichever is appropriate for the programming language at hand.
Merge and group per patient id and last name, get the maximum visit date, and get the sum and average of the scores per patient to get the resulting dataset.
Note that the visit date is purposefully provided as ISO format, so that it could also be processed as text and sorted alphabetically to determine the maximum date.
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
- Note
This task is aimed in particular at programming languages that are used in data science and data processing, such as F#, Python, R, SPSS, MATLAB etc.
- Related tasks
AutoHotkey
<lang AutoHotkey>Merge_and_aggregate(patients, visits){
ID := [], LAST_VISIT := [], SCORE_SUM := [], VISIT := [] for i, line in StrSplit(patients, "`n", "`r"){ if (i=1) continue x := StrSplit(line, ",") ID[x.1] := x.2 }
for i, line in StrSplit(visits, "`n", "`r"){ if (i=1) continue x := StrSplit(line, ",") LAST_VISIT[x.1] := x.2 > LAST_VISIT[x.1] ? x.2 : LAST_VISIT[x.1] SCORE_SUM[x.1] := (SCORE_SUM[x.1] ? SCORE_SUM[x.1] : 0) + (x.3 ? x.3 : 0) if x.3 VISIT[x.1] := (VISIT[x.1] ? VISIT[x.1] : 0) + 1 }
output := "PATIENT_ID`tLASTNAME`tLAST_VISIT`tSCORE_SUM`tSCORE_AVG`n" for id, name in ID output .= ID "`t" name "`t" LAST_VISIT[id] "`t" SCORE_SUM[id] "`t" SCORE_SUM[id]/VISIT[id] "`n" return output
}</lang> Examples:<lang AutoHotkey>patients = ( PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz )
visits = ( PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 )
MsgBox % Merge_and_aggregate(patients, visits) return</lang>
- Output:
PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG 1001 Hopper 2020-11-19 17.400000 5.800000 2002 Gosling 2020-10-08 6.800000 6.800000 3003 Kemeny 2020-11-12 0 4004 Wirth 2020-11-05 15.400000 7.700000 5005 Kurtz
AWK
<lang AWK>
- syntax: GAWK -f MERGE_AND_AGGREGATE_DATASETS.AWK RC-PATIENTS.CSV RC-VISITS.CSV
- files may appear in any order
- sorting:
- PROCINFO["sorted_in"] is used by GAWK
- SORTTYPE is used by Thompson Automation's TAWK
{ # printf("%s %s\n",FILENAME,$0) # print input
split($0,arr,",") if (FNR == 1) { file = (arr[2] == "LASTNAME") ? "patients" : "visits" next } patient_id_arr[key] = key = arr[1] if (file == "patients") { lastname_arr[key] = arr[2] } else if (file == "visits") { if (arr[2] > visit_date_arr[key]) { visit_date_arr[key] = arr[2] } if (arr[3] != "") { score_arr[key] += arr[3] score_count_arr[key]++ } }
} END {
print("") PROCINFO["sorted_in"] = "@ind_str_asc" ; SORTTYPE = 1 fmt = "%-10s %-10s %-10s %9s %9s %6s\n" printf(fmt,"patient_id","lastname","last_visit","score_sum","score_avg","scores") for (i in patient_id_arr) { avg = (score_count_arr[i] > 0) ? score_arr[i] / score_count_arr[i] : "" printf(fmt,patient_id_arr[i],lastname_arr[i],visit_date_arr[i],score_arr[i],avg,score_count_arr[i]+0) } exit(0)
} </lang>
- Output:
patient_id lastname last_visit score_sum score_avg scores 1001 Hopper 2020-11-19 17.4 5.8 3 2002 Gosling 2020-10-08 6.8 6.8 1 3003 Kemeny 2020-11-12 0 4004 Wirth 2020-11-05 15.4 7.7 2 5005 Kurtz 0
C++
Uses C++20 <lang cpp>#include <iostream>
- include <optional>
- include <ranges>
- include <string>
- include <vector>
using namespace std;
struct Patient {
string ID; string LastName;
};
struct Visit {
string PatientID; string Date; optional<float> Score;
};
int main(void) {
auto patients = vector<Patient> { {"1001", "Hopper"}, {"4004", "Wirth"}, {"3003", "Kemeny"}, {"2002", "Gosling"}, {"5005", "Kurtz"}};
auto visits = vector<Visit> { {"2002", "2020-09-10", 6.8}, {"1001", "2020-09-17", 5.5}, {"4004", "2020-09-24", 8.4}, {"2002", "2020-10-08", }, {"1001", "" , 6.6}, {"3003", "2020-11-12", }, {"4004", "2020-11-05", 7.0}, {"1001", "2020-11-19", 5.3}};
// sort the patients by ID sort(patients.begin(), patients.end(), [](const auto& a, const auto&b){ return a.ID < b.ID;});
cout << "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n"; for(const auto& patient : patients) { // loop over all of the patients and determine the fields string lastVisit; float sum = 0; int numScores = 0; // use C++20 ranges to filter the visits by patients auto patientFilter = [&patient](const Visit &v){return v.PatientID == patient.ID;}; for(const auto& visit : visits | views::filter( patientFilter )) { if(visit.Score) { sum += *visit.Score; numScores++; } lastVisit = max(lastVisit, visit.Date); } // format the output cout << "| " << patient.ID << " | "; cout.width(8); cout << patient.LastName << " | "; cout.width(10); cout << lastVisit << " | "; if(numScores > 0) { cout.width(9); cout << sum << " | "; cout.width(9); cout << (sum / float(numScores)); } else cout << " | "; cout << " |\n"; }
}</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
Datalog
The following implementation is for Souffle. Souffle does not currently include dates and times in the base distribution, but it is straightforward to use C timestamps: <lang C>#include <ctime>
- include <cstdint>
extern "C" {
int64_t from date(const char* string) { struct tm tmInfo = {0}; strptime(string, "%Y-%m-%d", &tmInfo); return mktime(&tmInfo); // localtime }
}</lang>
Rather than combine the summaries, the date and score summaries have been presented separately. <lang datalog>#define NaN to_float("NaN") .functor from_date(date:symbol) : number .decl Patient(id:number, lastname:symbol) .decl Visit(id:number, date:symbol, score:float) .decl SummaryDates(id:number, lastname:symbol, last_date:symbol) .decl SummaryScores(id:number, lastname:symbol, score_sum:float, score_mean:float) .decl MissingDates(x:number)
Patient(1001,"Hopper"). Patient(4004,"Wirth"). Patient(3003,"Kemeny"). Patient(2002,"Gosling"). Patient(5005,"Kurtz").
Visit(2002,"2020-09-10",6.8). Visit(1001,"2020-09-17",5.5). Visit(4004,"2020-09-24",8.4). Visit(2002,"2020-10-08",NaN). Visit(1001,"",6.6). Visit(3003,"2020-11-12",NaN). Visit(4004,"2020-11-05",7.0). Visit(1001,"2020-11-19",5.3).
MissingDates(@from_date("")) :- true.
SummaryDates(id, lastname, last_date) :-
Patient(id,lastname), last_timestamp = max ts: {Visit(id, date, _), ts = @from_date(date), !MissingDates(ts)}, Visit(id, last_date, _), last_timestamp = @from_date(last_date).
SummaryScores(id, lastname, score_sum, score_mean) :-
Patient(id,lastname), score_sum = sum score: {Visit(id, _, score), score != NaN}, score_mean = mean score: {Visit(id, _, score), score != NaN}.
.output SummaryDates .output SummaryScores</lang>
Then this is called using: <lang bash>g++ -shared -fPIC datetime.cpp -o libfunctors.so souffle rosetta.dl</lang>
- Output:
1001 Hopper 2020-11-19 2002 Gosling 2020-10-08 3003 Kemeny 2020-11-12 4004 Wirth 2020-11-05 1001 Hopper 17.399999999999999 5.7999999999999998 2002 Gosling 6.7999999999999998 6.7999999999999998 4004 Wirth 15.4 7.7000000000000002
F#
Note that the scores are right justified to copy the task description. It would be more natural to leave them right justified. <lang fsharp> // Merge and aggregate datasets. Nigel Galloway: January 6th., 2021 let rFile(fName)=seq{use n=System.IO.File.OpenText(fName)
n.ReadLine() |> ignore while not n.EndOfStream do yield n.ReadLine().Split [|','|]}
let N=rFile("file1.txt") |> Seq.sort let G=rFile("file2.txt") |> Seq.groupBy(fun n->n.[0]) |> Map.ofSeq let fN n i g e l=printfn "| %-10s | %-8s | %10s | %-9s | %-9s |" n i g e l let fG n g=let z=G.[n]|>Seq.sumBy(fun n->try float n.[2] with :? System.FormatException->0.0)
fN n g (G.[n]|>Seq.sort|>Seq.last).[1] (if z=0.0 then "" else string z) (if z=0.0 then "" else string(z/(float(Seq.length G.[n]))))
</lang>
- Output:
printfn "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" N|>Seq.iter(fun n->match G.ContainsKey n.[0] with true->fG n.[0] n.[1] |_->fN n.[0] n.[1] "" "" "") | PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 3.4 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
Go
<lang go>package main
import (
"fmt" "math" "sort"
)
type Patient struct {
id int lastName string
}
// maps an id to a lastname var patientDir = make(map[int]string)
// maintains a sorted list of ids var patientIds []int
func patientNew(id int, lastName string) Patient {
patientDir[id] = lastName patientIds = append(patientIds, id) sort.Ints(patientIds) return Patient{id, lastName}
}
type DS struct {
dates []string scores []float64
}
type Visit struct {
id int date string score float64
}
// maps an id to lists of dates and scores var visitDir = make(map[int]DS)
func visitNew(id int, date string, score float64) Visit {
if date == "" { date = "0000-00-00" } v, ok := visitDir[id] if ok { v.dates = append(v.dates, date) v.scores = append(v.scores, score) visitDir[id] = DS{v.dates, v.scores} } else { visitDir[id] = DS{[]string{date}, []float64{score}} } return Visit{id, date, score}
}
type Merge struct{ id int }
func (m Merge) lastName() string { return patientDir[m.id] } func (m Merge) dates() []string { return visitDir[m.id].dates } func (m Merge) scores() []float64 { return visitDir[m.id].scores }
func (m Merge) lastVisit() string {
dates := m.dates() dates2 := make([]string, len(dates)) copy(dates2, dates) sort.Strings(dates2) return dates2[len(dates2)-1]
}
func (m Merge) scoreSum() float64 {
sum := 0.0 for _, score := range m.scores() { if score != -1 { sum += score } } return sum
}
func (m Merge) scoreAvg() float64 {
count := 0 for _, score := range m.scores() { if score != -1 { count++ } } return m.scoreSum() / float64(count)
}
func mergePrint(merges []Merge) {
fmt.Println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |") f := "| %d | %-7s | %s | %4s | %4s |\n" for _, m := range merges { _, ok := visitDir[m.id] if ok { lv := m.lastVisit() if lv == "0000-00-00" { lv = " " } scoreSum := m.scoreSum() ss := fmt.Sprintf("%4.1f", scoreSum) if scoreSum == 0 { ss = " " } scoreAvg := m.scoreAvg() sa := " " if !math.IsNaN(scoreAvg) { sa = fmt.Sprintf("%4.2f", scoreAvg) } fmt.Printf(f, m.id, m.lastName(), lv, ss, sa) } else { fmt.Printf(f, m.id, m.lastName(), " ", " ", " ") } }
}
func main() {
patientNew(1001, "Hopper") patientNew(4004, "Wirth") patientNew(3003, "Kemeny") patientNew(2002, "Gosling") patientNew(5005, "Kurtz")
visitNew(2002, "2020-09-10", 6.8) visitNew(1001, "2020-09-17", 5.5) visitNew(4004, "2020-09-24", 8.4) visitNew(2002, "2020-10-08", -1) // -1 signifies no score visitNew(1001, "", 6.6) // "" signifies no date visitNew(3003, "2020-11-12", -1) visitNew(4004, "2020-11-05", 7.0) visitNew(1001, "2020-11-19", 5.3)
merges := make([]Merge, len(patientIds)) for i, id := range patientIds { merges[i] = Merge{id} } mergePrint(merges)
}</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
Julia
<lang julia>using CSV, DataFrames, Statistics
- load data from csv files
- df_patients = CSV.read("patients.csv", DataFrame)
- df_visits = CSV.read("visits.csv", DataFrame)
- create DataFrames from text that is hard coded, so use IOBuffer(String) as input
str_patients = IOBuffer("""PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """) df_patients = CSV.read(str_patients, DataFrame) str_visits = IOBuffer("""PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """) df_visits = CSV.read(str_visits, DataFrame)
- merge on PATIENT_ID, using an :outer join or we lose Kurtz, who has no data, sort by ID
df_merge = sort(join(df_patients, df_visits, on="PATIENT_ID", kind=:outer), (:PATIENT_ID,))
fnonmissing(a, f) = isempty(a) ? [] : isempty(skipmissing(a)) ? a[1] : f(skipmissing(a))
- group by patient id / last name and then aggregate to get latest visit and mean score
df_result = by(df_merge, [:PATIENT_ID, :LASTNAME]) do df
DataFrame(LATEST_VISIT = fnonmissing(df[:VISIT_DATE], maximum), SUM_SCORE = fnonmissing(df[:SCORE], sum), MEAN_SCORE = fnonmissing(df[:SCORE], mean))
end println(df_result)
</lang>
- Output:
5×5 DataFrame │ Row │ PATIENT_ID │ LASTNAME │ LATEST_VISIT │ SUM_SCORE │ MEAN_SCORE │ │ │ Int64? │ String? │ Dates.Date? │ Float64? │ Float64? │ ├─────┼────────────┼──────────┼──────────────┼───────────┼────────────┤ │ 1 │ 1001 │ Hopper │ 2020-11-19 │ 17.4 │ 5.8 │ │ 2 │ 2002 │ Gosling │ 2020-10-08 │ 6.8 │ 6.8 │ │ 3 │ 3003 │ Kemeny │ 2020-11-12 │ missing │ missing │ │ 4 │ 4004 │ Wirth │ 2020-11-05 │ 15.4 │ 7.7 │ │ 5 │ 5005 │ Kurtz │ missing │ missing │ missing │
Nim
CSV files and tables
<lang Nim>import algorithm, parsecsv, strformat, strutils, tables
const NoValue = -1.0
type
Names = OrderedTable[Positive, string] # Mapping id -> last name.
Visit = tuple[date: string; score: float] Visits = Table[Positive, seq[Visit]] # Mapping id -> list of visits.
proc readNames(path: string): Names =
## Read the records (id, lastname) from the CSV file and fill a Names table. var parser: CsvParser parser.open(path) parser.readHeaderRow() while parser.readRow(): let id = parser.row[0].parseInt let name = parser.row[1] result[id] = name
proc readVisits(path: string): Visits =
## Read the records (id, date, score) from the CSV file and fill a Visits table. var parser: CsvParser parser.open(path) parser.readHeaderRow() while parser.readRow(): let id = parser.row[0].parseInt let date = parser.row[1] let score = if parser.row[2].len == 0: NoValue else: parser.row[2].parseFloat result.mgetOrPut(id, @[]).add (date, score)
var
names = readNames("patients1.csv") visits = readVisits("patients2.csv")
names.sort(system.cmp)
echo "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" for (id, name) in names.pairs:
let visitList = visits.getOrDefault(id).sorted() let lastVisit = if visitList.len == 0: "" else: visitList[^1].date var sum = 0.0 var count = 0 for visit in visitList: if visit.score != NoValue: sum += visit.score inc count let scoreSum = if count == 0: "" else: &"{sum:>4.1f}" let scoreAvg = if count == 0: "" else: &"{sum / count.toFloat: >4.2f}" echo &"| {id:^10} | {name:^10} | {lastVisit:^10} | {scoreSum:>7} | {scoreAvg:>6} |"</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
CSV files and SQLite3
We use the high level standard library "db_sqlite" rather than the low level one "sqlite3". <lang Nim>import parseCsv, db_sqlite, sequtils, strutils
const FNames = ["patients1.csv", "patients2.csv"]
proc createTableHeaders(conn: DbConn) =
conn.exec(sql"CREATE TABLE names(PATIENT_ID INT, LASTNAME TEXT);") conn.exec(sql"CREATE TABLE visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));")
proc fillTables(dbc: DbConn) =
for idx, fname in FNames: dbc.exec(sql"BEGIN") var parser: CsvParser parser.open(fname) parser.readHeaderRow() while parser.readRow(): if idx == 0: # "names" table. dbc.exec(sql"INSERT INTO names VALUES (?, ?);", parser.row) else: # "visits" table dbc.exec(sql"INSERT INTO visits VALUES (?, ?, ?);", parser.row) dbc.exec(sql"COMMIT")
proc joinTablesAndGroup(dbc: DbConn): seq[Row] =
dbc.exec(sql"""CREATE TABLE answer AS SELECT names.PATIENT_ID, names.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM names LEFT JOIN visits ON visits.PATIENT_ID = names.PATIENT_ID GROUP BY names.PATIENT_ID, names.LASTNAME ORDER BY names.PATIENT_ID;""") result = dbc.getAllRows(sql"SELECT * FROM ANSWER")
- Build the database and execute the request to get the result.
let dbc = open(":memory:", "", "", "") dbc.createTableHeaders() dbc.fillTables() let result = dbc.joinTablesAndGroup() dbc.close()
- Print the result.
echo "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" for row in result:
echo "| " & row.mapit(it.center(10)).join(" | ") & '|'</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 3.4 | | 3003 | Kemeny | 2020-11-12 | 0.0 | 0 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
Phix
constant patients_txt = split(""" PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz""",'\n'), visits_txt = split(""" PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3""",'\n') --or, assuming these files contain exactly the same actual raw text, -- and the use of GT_LF_STRIPPED is instead of above split('\n'): --constant patients_txt = get_text("patients.csv",GT_LF_STRIPPED), -- visits_txt = get_text("visits.csv",GT_LF_STRIPPED) function sap(string s) return split(s,',',false)&{"",0,0} end function function svp(string s) return split(s,',',false) end function sequence patient_data = sort(apply(patients_txt[2..$],sap)), visit_data = sort_columns(apply(visits_txt[2..$],svp),{1,-2}) visit_data = append(visit_data,{"","","0"}) -- (add a sentinel) string last_id = "",id,name,dt,scstr,avstr atom score,score_total,average integer visit_count = 0, pdx = 1 for i=1 to length(visit_data) do {id,dt,scstr} = visit_data[i] score = iff(scstr=""?0:scanf(scstr,"%f")[1][1]) if id!=last_id then if visit_count then average = score_total/visit_count patient_data[pdx][4..5] = {score_total,average} end if if i=length(visit_data) then exit end if -- (sentinel) score_total = score visit_count = (score!=0) while id!=patient_data[pdx][1] do pdx += 1 end while patient_data[pdx][3] = dt last_id = id elsif score!=0 then score_total += score visit_count += 1 end if end for printf(1,"| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n") for i=1 to length(patient_data) do {id,name,dt,score,average} = patient_data[i] scstr = iff(score=0?"":sprintf("%4.1f",score)) avstr = iff(average=0?"":sprintf("%4.2f",average)) printf(1,"| %-10s | %-7s | %10s | %-9s | %-9s |\n", {id,name,dt,scstr,avstr}) end for
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
Perl
Weird requirement: "Use the most common libraries only when built-in functionality is not sufficient." Not even a "use strict;" :) <lang perl>#!/usr/bin/perl my $fmt = '| %-11s' x 5 . "|\n"; printf $fmt, qw( PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG); my ($names, $visits) = do { local $/; split /^\n/m, }; my %score; for ( $visits =~ /^\d.*/gm )
{ my ($id, undef, $score) = split /,/; $score{$id} //= [, ]; $score and $score{$id}[0]++, $score{$id}[1] += $score; }
for ( sort $names =~ /^\d.*/gm )
{ my ($id, $name) = split /,/; printf $fmt, $id, $name, ( sort $visits =~ /^$id,(.*?),/gm, )[-1], $score{$id}[0] ? ( $score{$id}[1], $score{$id}[1] / $score{$id}[0]) : (, ); }
__DATA__ PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz
PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
PureBasic
<lang PureBasic>Structure Person
Name$
EndStructure
Structure Visits
Datum$ Score$
EndStructure
Structure Merge
Patient.Person List PVisit.Visits()
EndStructure
NewMap P.Merge() NewList ID$()
If ReadFile(1,"./Data/patients.csv")=0 : End 1 : EndIf header=1 While Not Eof(1)
buf1$=ReadString(1) If header=1 : header=0 : Continue : EndIf bufId$=StringField(buf1$,1,",") P(bufId$)\Patient\Name$=StringField(buf1$,2,",") AddElement(ID$()) : ID$()=bufId$
Wend CloseFile(1)
If ReadFile(2,"./Data/visits.csv")=0 : End 2 : EndIf header=1 While Not Eof(2)
buf1$=ReadString(2) If header=1 : header=0 : Continue : EndIf bufId$=StringField(buf1$,1,",") AddElement(P(bufId$)\PVisit()) P(bufId$)\PVisit()\Datum$=StringField(buf1$,2,",") P(bufId$)\PVisit()\Score$=StringField(buf1$,3,",")
Wend CloseFile(2)
If OpenConsole()=0 : End 3 : EndIf SortList(ID$(),#PB_Sort_Ascending) PrintN("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |") ForEach ID$()
Print("| "+LSet(ID$(),11)) Print("| "+LSet(P(ID$())\Patient\Name$,9)+"|") SortStructuredList(P(ID$())\PVisit(),#PB_Sort_Ascending,OffsetOf(Visits\Datum$),TypeOf(Visits\Datum$)) ForEach P(ID$())\PVisit() scs.f+ValF(p(ID$())\PVisit()\Score$) : c+Bool(ValF(p(ID$())\PVisit()\Score$)) Next If LastElement(P(ID$())\PVisit()) sca.f=scs/c Print(" "+LSet(P(ID$())\PVisit()\Datum$,10)+" |") Print(RSet(StrF(scs,1),10)+" |") If Not IsNAN(sca) : Print(RSet(StrF(sca,2),10)+" |") : Else : Print(Space(11)+"|") : EndIf Else Print(Space(12)+"|"+Space(11)+"|"+Space(11)+"|") EndIf PrintN("") : scs=0 : c=0
Next Input()</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | 0.0 | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
Python
Python: Using pandas library
<lang Python># to install pandas library go to cmd prompt and type:
- cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\
- pip install pandas
import pandas as pd
- load data from csv files
df_patients = pd.read_csv (r'patients.csv', sep = ",", decimal=".") df_visits = pd.read_csv (r'visits.csv', sep = ",", decimal=".")
# load data hard coded, create data frames import io str_patients = """PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """ df_patients = pd.read_csv(io.StringIO(str_patients), sep = ",", decimal=".") str_visits = """PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """ df_visits = pd.read_csv(io.StringIO(str_visits), sep = ",", decimal=".")
- typecast from string to datetime so .agg can 'max' it
df_visits['VISIT_DATE'] = pd.to_datetime(df_visits['VISIT_DATE'])
- merge on PATIENT_ID
df_merge = df_patients.merge(df_visits, on='PATIENT_ID', how='left')
- groupby is an intermediate object
df_group = df_merge.groupby(['PATIENT_ID','LASTNAME'], as_index=False)
- note: you can use 'sum' instead of the lambda function but that returns NaN as 0 (zero)
df_result = df_group.agg({'VISIT_DATE': 'max', 'SCORE': [lambda x: x.sum(min_count=1),'mean']})
print(df_result)</lang>
PATIENT_ID LASTNAME LAST_VISIT SCORE max <lambda_0> mean 0 1001 Hopper 2020-11-19 17.4 5.8 1 2002 Gosling 2020-10-08 6.8 6.8 2 3003 Kemeny 2020-11-12 NaN NaN 3 4004 Wirth 2020-11-05 15.4 7.7 4 5005 Kurtz NaT NaN NaN
Python: Stdlib csv only
Using only standard libraries and input from csv files.
<lang python>import csv
fnames = 'patients.csv patients_visits.csv'.split()
def csv2list(fname):
with open(fname) as f: rows = list(csv.reader(f)) return rows
patients, visits = data = [csv2list(fname) for fname in fnames] result = [record.copy() for record in patients] result[1:] = sorted(result[1:])
- %%
result[0].append('LAST_VISIT') last = {p: vis for p, vis, *score in visits[1:]} for record in result[1:]:
p = record[0] record.append(last.get(p, ))
- %%
result[0] += ['SCORE_SUM', 'SCORE_AVG'] n = {p: 0 for p, *_ in patients[1:]} tot = n.copy() for record in visits[1:]:
p, _, score = record if score: n[p] += 1 tot[p] += float(score)
for record in result[1:]:
p = record[0] if n[p]: record += [f"{tot[p]:5.1f}", f"{tot[p] / n[p]:5.2f}"] else: record += [, ]
- %%
for record in result:
print(f"| {' | '.join(f'{r:^10}' for r in record)} |")</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |
Python: Stdlib sqlite3 and csv only
Using the csv module just to parse inputs; and the sqlite3 module, (which which is also a standard library that comes with the base Python install), to calculate the output.
(The SQL SELECT statement is modelled on that of the SQL entry elsewhere on this page).
<lang python>import sqlite3 import csv
fnames = 'patients.csv patients_visits.csv'.split()
conn = sqlite3.connect(":memory:")
- %%
def create_table_headers(conn):
curs = conn.cursor() curs.execute( CREATE TABLE patients(PATIENT_ID INT, LASTNAME TEXT); ) curs.execute( CREATE TABLE patients_visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1)); ) conn.commit()
def fill_tables(conn, fnames):
curs = conn.cursor() for fname in fnames: with open(fname) as f: tablename = fname.replace('.csv', ) # csvdata = csv.reader(f) header = next(csvdata) fields = ','.join('?' for _ in header) for row in csvdata: row = [(None if r == else r) for r in row] curs.execute(f"INSERT INTO {tablename} VALUES ({fields});", row) conn.commit()
def join_tables_and_group(conn):
curs = conn.cursor() curs.execute(
CREATE TABLE answer AS
SELECT patients.PATIENT_ID, patients.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM patients LEFT JOIN patients_visits ON patients_visits.PATIENT_ID = patients.PATIENT_ID GROUP BY patients.PATIENT_ID, patients.LASTNAME ORDER BY patients.PATIENT_ID; ) curs.execute(
SELECT * FROM answer;
) conn.commit() rows = list(curs.fetchall()) headers = tuple(d[0] for d in curs.description) return [headers] + rows
create_table_headers(conn) fill_tables(conn, fnames) result = join_tables_and_group(conn) for record in result:
print(f"| {' | '.join(f'{str(r):^10}' for r in record)} |")</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | None | None | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | None | None | None |
R
<lang R># load data from csv files
- setwd("C:\Temp\csv\")
- df_patient <- read.csv(file="patients.csv", header = TRUE, sep = ",")
- df_visits <- read.csv(file="visits.csv", header = TRUE, sep = ",", dec = ".", colClasses=c("character","character","numeric"))
- load data hard coded, create data frames
df_patient <- read.table(text = " PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz ", header = TRUE, sep = ",") # character fields so no need for extra parameters colClasses etc.
df_visits <- read.table(text = " PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 ", header = TRUE, dec = ".", sep = ",", colClasses=c("character","character","numeric"))
- aggregate visit date and scores
df_agg <- data.frame(
cbind( PATIENT_ID = names(tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE)), last_visit = tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE), score_sum = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), sum, na.rm=TRUE), score_avg = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), mean, na.rm=TRUE) )
)
- merge patients and aggregate dataset
- all.x = all the non matching cases of df_patient are appended to the result as well (i.e. 'left join')
df_result <- merge(df_patient, df_agg, by = 'PATIENT_ID', all.x = TRUE)
print(df_result)</lang>
- Output:
PATIENT_ID LASTNAME last_visit score_sum score_avg 1 1001 Hopper 2020-11-19 17.4 5.8 2 2002 Gosling 2020-10-08 6.8 6.8 3 3003 Kemeny 2020-11-12 0 NaN 4 4004 Wirth 2020-11-05 15.4 7.7 5 5005 Kurtz <NA> <NA> <NA>
Raku
<lang perl6>my @names = map { ( <PATIENT_ID LASTNAME> Z=> .list ).hash },
( 1001, 'Hopper' ), ( 4004, 'Wirth' ), ( 3003, 'Kemeny' ), ( 2002, 'Gosling' ), ( 5005, 'Kurtz' ),
my @visits = map { ( <PATIENT_ID VISIT_DATE SCORE> Z=> .list ).hash },
( 2002, '2020-09-10', 6.8 ), ( 1001, '2020-09-17', 5.5 ), ( 4004, '2020-09-24', 8.4 ), ( 2002, '2020-10-08', Nil ), ( 1001, Nil , 6.6 ), ( 3003, '2020-11-12', Nil ), ( 4004, '2020-11-05', 7.0 ), ( 1001, '2020-11-19', 5.3 ),
my %v = @visits.classify: *.<PATIENT_ID>;
my @result = gather for @names -> %n {
my @p = %v{ %n.<PATIENT_ID> }<>;
my @dates = @p».<VISIT_DATE>.grep: *.defined; my @scores = @p».< SCORE>.grep: *.defined;
take { %n, LAST_VISIT => ( @dates.max if @dates ), SCORE_AVG => ( @scores.sum/@scores if @scores ), SCORE_SUM => ( @scores.sum if @scores ), };
}
my @out_field_names = <PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG>; my @rows = @result.sort(*.<PATIENT_ID>).map(*.{@out_field_names}); say .map({$_ // }).fmt('%-10s', ' | ') for @out_field_names, |@rows;</lang>
- Output:
PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 3003 | Kemeny | 2020-11-12 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 5005 | Kurtz | | |
REXX
<lang rexx>/* REXX */ patients='patients.csv' l=linein(patients) Parse Var l h1 ',' h2 n=0 idl= Do n=1 By 1 While lines(patients)>0
l=linein(patients) Parse Var l id ',' lastname.id idl=idl id End
n=n-1 /* number of patients */ visits='visits.csv' l=linein(visits) /* skip the header line of this file */ h3='LAST_VISIT' h4='SCORE_SUM' h5='SCORE_AVG' date.= score.=0 Say '|' h1 '|' h2 '|' h3 '|' h4 '|' h5 '|' Do While lines(visits)>0
l=linein(visits) Parse Var l id ',' date ',' score if date>date.id Then date.id=date If score> Then Do z=score.id.0+1 score.id.z=score score.id.0=z End end
idl=wordsort(idl) Do While idl<>
Parse Var idl id idl If date.id= Then date.id=copies(' ',10) ol='|' left(id,length(h1)) '|' left(lastname.id,length(h2)), '|' left(date.id,length(h3)) If score.id.0=0 Then Do ol=ol '|' left(' ',length(h4)) '|', left(' ',length(h5)) '|' score_sum=copies(' ',length(h4)) score_avg=copies(' ',length(h4)) End Else Do score_sum=0 Do j=1 To score.id.0 score_sum=score_sum+score.id.j End score_avg=score_sum/score.id.0 ol=ol '|' left(format(score_sum,2,1),length(h4)) '|', left(format(score_avg,2,1),length(h5)) '|' End Say ol End
Exit
wordsort: Procedure /**********************************************************************
- Sort the list of words supplied as argument. Return the sorted list
- /
Parse Arg wl wa.= wa.0=0 Do While wl<> Parse Var wl w wl Do i=1 To wa.0 If wa.i>w Then Leave End If i<=wa.0 Then Do Do j=wa.0 To i By -1 ii=j+1 wa.ii=wa.j End End wa.i=w wa.0=wa.0+1 End swl= Do i=1 To wa.0 swl=swl wa.i End Return strip(swl)</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
SPSS
Note: the PATIENT_ID 5005 is missing from the result, because MATCH FILES cannot add rows for missing keys, i.e. AFAIK a "left join" is not possible is SPSS.
<lang SPSS>* set working directory to location of .csv files
CD 'C:\Temp\csv\'.
- load patients csv data
GET DATA /TYPE=TXT
/FILE="patients.csv" /ENCODING='UTF8' /DELCASE=LINE /DELIMITERS="," /QUALIFIER='"' /ARRANGEMENT=DELIMITED /FIRSTCASE=2 /IMPORTCASE=ALL /VARIABLES=
PATIENT_ID F5.0 LASTNAME A20 . CACHE. EXECUTE.
- sort cases is needed to match files
SORT CASES BY PATIENT_ID (A). DATASET NAME Patients WINDOW=FRONT.
- load visits csv data
GET DATA /TYPE=TXT
/FILE="visit.csv" /ENCODING='UTF8' /DELCASE=LINE /DELIMITERS="," /QUALIFIER='"' /ARRANGEMENT=DELIMITED /FIRSTCASE=2 /IMPORTCASE=ALL /VARIABLES=
PATIENT_ID F5.0 VISIT_DATE SDATE10 SCORE F4.1 . CACHE. EXECUTE.
- sort cases is needed, else match files will raise error "Files out of order"
SORT CASES BY PATIENT_ID (A) VISIT_DATE (A). DATASET NAME Visits WINDOW=FRONT.
- load visits csv data
- merge datasets, one to many, FILE is the 'one', TABLE is 'many'
MATCH FILES TABLE = Patients / FILE = Visits /BY PATIENT_ID. EXECUTE.
- aggregate visit date and scores, group by and order (A)=ascending or (D)=descending
AGGREGATE OUTFILE * /BREAK=PATIENT_ID(A) /last_visit = MAX(VISIT_DATE) /score_avg = MEAN(SCORE) /score_sum = SUM(SCORE).</lang>
- Output:
| PATIENT_ID | LASTNAME | last_visit | score_sum | score_avg | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | . | . | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 |
SQL
<lang SQL>-- drop tables DROP TABLE IF EXISTS tmp_patients; DROP TABLE IF EXISTS tmp_visits;
-- create tables CREATE TABLE tmp_patients( PATIENT_ID INT, LASTNAME VARCHAR(20) );
CREATE TABLE tmp_visits( PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1) );
-- load data from csv files /* -- Note: LOAD DATA LOCAL requires `local-infile` enabled on both the client and server else you get error "#1148 command is not allowed.." LOAD DATA LOCAL INFILE '/home/csv/patients.csv' INTO TABLE `tmp_patients` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; LOAD DATA LOCAL INFILE '/home/csv/visits.csv' INTO TABLE `tmp_visits` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;
- /
-- load data hard coded INSERT INTO tmp_patients(PATIENT_ID, LASTNAME) VALUES (1001, 'Hopper'), (4004, 'Wirth'), (3003, 'Kemeny'), (2002, 'Gosling'), (5005, 'Kurtz');
INSERT INTO tmp_visits(PATIENT_ID, VISIT_DATE, SCORE) VALUES (2002, '2020-09-10', 6.8), (1001, '2020-09-17', 5.5), (4004, '2020-09-24', 8.4), (2002, '2020-10-08', NULL), (1001, NULL, 6.6), (3003, '2020-11-12', NULL), (4004, '2020-11-05', 7.0), (1001, '2020-11-19', 5.3);
-- join tables and group SELECT p.PATIENT_ID, p.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM tmp_patients p LEFT JOIN tmp_visits v ON v.PATIENT_ID = p.PATIENT_ID GROUP BY p.PATIENT_ID, p.LASTNAME ORDER BY p.PATIENT_ID;</lang>
- Output:
PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG 1001 Hopper 2020-11-19 17.4 5.80 2002 Gosling 2020-10-08 6.8 6.80 3003 Kemeny 2020-11-12 NULL NULL 4004 Wirth 2020-11-05 15.4 7.70 5005 Kurtz NULL NULL NULL
Wren
<lang ecmascript>import "/fmt" for Fmt import "/sort" for Sort
class Patient {
construct new(id, lastName) { _id = id _lastName = lastName if (!__dir) __dir = {} __dir[id] = lastName if (!__ids) { __ids = [id] } else { __ids.add(id) Sort.insertion(__ids) } }
id { _id } lastName { _lastName }
// maps an id to a lastname static dir { __dir }
// maintains a sorted list of ids static ids { __ids }
}
class Visit {
construct new(id, date, score) { _id = id _date = date || "0000-00-00" _score = score if (!__dir) __dir = {} if (!__dir[id]) { __dir[id] = [ [_date], [score] ] } else { __dir[id][0].add(_date) __dir[id][1].add(score) } }
id { _id } date { _date } score { _score }
// maps an id to lists of dates and scores static dir { __dir }
}
class Merge {
construct new(id) { _id = id }
id { _id } lastName { Patient.dir[_id] } dates { Visit.dir[_id][0] } scores { Visit.dir[_id][1] } lastVisit { Sort.merge(dates)[-1] }
scoreSum { scores.reduce(0) { |acc, s| s ? acc + s : acc } } scoreAvg { scoreSum / scores.count { |s| s } }
static print(merges) { System.print("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |") var fmt = "| $d | $-7s | $s | $4s | $4s |" for (m in merges) { if (Visit.dir[m.id]) { var lv = (m.lastVisit != "0000-00-00") ? m.lastVisit : " " var ss = (m.scoreSum > 0) ? Fmt.f(4, m.scoreSum, 1) : " " var sa = (!m.scoreAvg.isNan) ? Fmt.f(4, m.scoreAvg, 2) : " " Fmt.print(fmt, m.id, m.lastName, lv, ss, sa) } else { Fmt.print(fmt, m.id, m.lastName, " ", " ", " ") } } }
}
Patient.new(1001, "Hopper") Patient.new(4004, "Wirth") Patient.new(3003, "Kemeny") Patient.new(2002, "Gosling") Patient.new(5005, "Kurtz")
Visit.new(2002, "2020-09-10", 6.8) Visit.new(1001, "2020-09-17", 5.5) Visit.new(4004, "2020-09-24", 8.4) Visit.new(2002, "2020-10-08", null) Visit.new(1001, null , 6.6) Visit.new(3003, "2020-11-12", null) Visit.new(4004, "2020-11-05", 7.0) Visit.new(1001, "2020-11-19", 5.3)
var merges = Patient.ids.map { |id| Merge.new(id) }.toList Merge.print(merges)</lang>
- Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |