Merge and aggregate datasets: Difference between revisions
Content added Content deleted
(→{{header|Mercury}}: simplify bag_max) |
|||
Line 582: | Line 582: | ||
| 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | |
| 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | |
||
| 5005 | Kurtz | | | | |
| 5005 | Kurtz | | | | |
||
</pre> |
|||
=={{header|jq}}== |
|||
{{works with|jq}} |
|||
'''Works with gojq, the Go implementation of jq''' |
|||
In the context of jq, a relational dataset "table" is naturally represented as an array of JSON objects, each representing a row in the table. |
|||
When displaying such a table, we will only show the constituent rows. |
|||
Null values will be represented by JSON's `null`. |
|||
'''Ingesting CSV data''' |
|||
<lang jq> |
|||
# objectify/1 takes an array of atomic values as inputs, and packages |
|||
# these into an object with keys specified by the "headers" array and |
|||
# values obtained by trimming string values, replacing empty strings |
|||
# by null, and converting strings to numbers if possible. |
|||
def objectify(headers): |
|||
def tonumberq: tonumber? // .; |
|||
def trimq: if type == "string" then sub("^ +";"") | sub(" +$";"") else . end; |
|||
def tonullq: if . == "" then null else . end; |
|||
. as $in |
|||
| reduce range(0; headers|length) as $i |
|||
({}; .[headers[$i]] = ($in[$i] | trimq | tonumberq | tonullq) ); |
|||
def csv2jsonHelper: |
|||
.[0] as $headers |
|||
| reduce (.[1:][] | select(length > 0) ) as $row |
|||
([]; . + [ $row|objectify($headers) ]); |
|||
</lang> |
|||
'''Aggregation functions''' |
|||
<lang jq> |
|||
# output {LAST_VISIT} |
|||
def LAST_VISIT($patient_id): |
|||
{LAST_VISIT: (map(select( .PATIENT_ID == $patient_id).VISIT_DATE) | max)}; |
|||
# output {SCORE_SUM, SCORE_AVG} |
|||
def SCORE_SUMMARY($patient_id): |
|||
map(select( .PATIENT_ID == $patient_id).SCORE) |
|||
| {SCORE_SUM: add, count: length} |
|||
| {SCORE_SUM, SCORE_AVG: (if .SCORE_SUM and .count > 0 then .SCORE_SUM/.count else null end)}; |
|||
'''The task''' |
|||
# Read the two tables: |
|||
INDEX($patients | [splits("\n")] | map(split(",")) | csv2jsonHelper[]; .PATIENT_ID) as $patients |
|||
| ($visits | [splits("\n")] | map(split(",")) | csv2jsonHelper) as $visits |
|||
# Construct the new table: |
|||
| $visits |
|||
| map(.PATIENT_ID as $PATIENT_ID |
|||
| {$PATIENT_ID} + |
|||
($visits | {LASTNAME: $patients[$PATIENT_ID|tostring]} + LAST_VISIT($PATIENT_ID) + SCORE_SUMMARY($PATIENT_ID))) |
|||
# ... but display it as a sequence of JSON objects |
|||
| .[] |
|||
</lang> |
|||
'''Invocation''' |
|||
jq -Rnc --rawfile patients patients.csv --rawfile visits visits.csv -f program.jq |
|||
{{out}} |
|||
<pre> |
|||
{"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4} |
|||
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} |
|||
{"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7} |
|||
{"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4} |
|||
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} |
|||
{"PATIENT_ID":3003,"LASTNAME":{"PATIENT_ID":3003,"LASTNAME":"Kemeny"},"LAST_VISIT":"2020-11-12","SCORE_SUM":null,"SCORE_AVG":null} |
|||
{"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7} |
|||
{"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} |
|||
</pre> |
</pre> |
||