Merge and aggregate datasets: Difference between revisions
Content added Content deleted
(Add Transd) |
(→{{header|Haskell}}: added solution) |
||
Line 583: | Line 583: | ||
| 5005 | Kurtz | | | | |
| 5005 | Kurtz | | | | |
||
</pre> |
</pre> |
||
=={{header|Haskell}}== |
|||
===Reading and merging=== |
|||
Merging of fields and databases is defined as a monoid operation for corresponding types. |
|||
<lang haskell>import Data.List |
|||
import Data.Maybe (mapMaybe, maybeToList) |
|||
import System.IO (readFile) |
|||
import Text.Read (readMaybe) |
|||
import Control.Applicative ((<|>)) |
|||
------------------------------------------------------------ |
|||
newtype DB = DB { entries :: [Patient] } |
|||
deriving Show |
|||
instance Semigroup DB where |
|||
DB a <> DB b = normalize $ a <> b |
|||
instance Monoid DB where |
|||
mempty = DB [] |
|||
normalize :: [Patient] -> DB |
|||
normalize = DB |
|||
. map mconcat |
|||
. groupBy (\x y -> pid x == pid y) |
|||
. sortOn pid |
|||
------------------------------------------------------------ |
|||
data Patient = Patient { pid :: String |
|||
, name :: Maybe String |
|||
, visits :: [String] |
|||
, scores :: [Float] } |
|||
deriving Show |
|||
instance Semigroup Patient where |
|||
p1 <> p2 = Patient |
|||
(if null (pid p1) then pid p2 else pid p1) |
|||
(name p1 <|> name p2) |
|||
(visits p1 <|> visits p2) |
|||
(scores p2 <|> scores p1) |
|||
instance Monoid Patient where |
|||
mempty = Patient mempty mempty mempty mempty |
|||
------------------------------------------------------------ |
|||
readDB :: String -> DB |
|||
readDB = normalize |
|||
. mapMaybe readPatient |
|||
. readCSV |
|||
readPatient r = do |
|||
i <- lookup "PATIENT_ID" r |
|||
let n = lookup "LASTNAME" r |
|||
let d = lookup "VISIT_DATE" r >>= readDate |
|||
let s = lookup "SCORE" r >>= readMaybe |
|||
return $ Patient i n (maybeToList d) (maybeToList s) |
|||
where |
|||
readDate [] = Nothing |
|||
readDate d = Just d |
|||
readCSV :: String -> [(String, String)] |
|||
readCSV txt = zip header <$> body |
|||
where |
|||
header:body = splitBy ',' <$> lines txt |
|||
splitBy ch = unfoldr go |
|||
where |
|||
go [] = Nothing |
|||
go s = Just $ drop 1 <$> span (/= ch) s</lang> |
|||
<pre>let patients = readDB <$> readFile "patients.csv" |
|||
*Main> let visits = readDB <$> readFile "visits.csv" |
|||
*Main> mapM_ print . entries =<< patients |
|||
Patient {pid = "1001", name = Just "Hopper", visits = [], scores = []} |
|||
Patient {pid = "2002", name = Just "Gosling", visits = [], scores = []} |
|||
Patient {pid = "3003", name = Just "Kemeny", visits = [], scores = []} |
|||
Patient {pid = "4004", name = Just "Wirth", visits = [], scores = []} |
|||
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []} |
|||
*Main> mapM_ print . entries =<< visits |
|||
Patient {pid = "1001", name = Nothing, visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]} |
|||
Patient {pid = "2002", name = Nothing, visits = ["2020-09-10","2020-10-08"], scores = [6.8]} |
|||
Patient {pid = "3003", name = Nothing, visits = ["2020-11-12"], scores = []} |
|||
Patient {pid = "4004", name = Nothing, visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]} |
|||
*Main> mapM_ print . entries =<< patients <> visits |
|||
Patient {pid = "1001", name = Just "Hopper", visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]} |
|||
Patient {pid = "2002", name = Just "Gosling", visits = ["2020-09-10","2020-10-08"], scores = [6.8]} |
|||
Patient {pid = "3003", name = Just "Kemeny", visits = ["2020-11-12"], scores = []} |
|||
Patient {pid = "4004", name = Just "Wirth", visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]} |
|||
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}</pre> |
|||
===Pretty tabulation=== |
|||
<lang haskell> |
|||
tabulateDB (DB ps) header cols = intercalate "|" <$> body |
|||
where |
|||
body = transpose $ zipWith pad width table |
|||
table = transpose $ header : map showPatient ps |
|||
showPatient p = sequence cols p |
|||
width = maximum . map length <$> table |
|||
pad n col = (' ' :) . take (n+1) . (++ repeat ' ') <$> col |
|||
main = do |
|||
a <- readDB <$> readFile "patients.csv" |
|||
b <- readDB <$> readFile "visits.csv" |
|||
mapM_ putStrLn $ tabulateDB (a <> b) header fields |
|||
where |
|||
header = [ "PATIENT_ID", "LASTNAME", "VISIT_DATE" |
|||
, "SCORES SUM","SCORES AVG"] |
|||
fields = [ pid |
|||
, \p -> case name p of {Nothing -> []; Just n -> n} |
|||
, \p -> case visits p of {[] -> []; l -> last l} |
|||
, \p -> case scores p of {[] -> []; s -> show (sum s)} |
|||
, \p -> case scores p of {[] -> []; s -> show (mean s)} ] |
|||
mean lst = sum lst / genericLength lst</lang> |
|||
<pre>*Main> main |
|||
PATIENT_ID | LASTNAME | VISIT_DATE | SCORES SUM | SCORES AVG |
|||
1001 | Hopper | 2020-11-19 | 17.4 | 5.7999997 |
|||
2002 | Gosling | 2020-10-08 | 6.8 | 6.8 |
|||
3003 | Kemeny | 2020-11-12 | | |
|||
4004 | Wirth | 2020-11-05 | 15.4 | 7.7 |
|||
5005 | Kurtz | | | </pre> |
|||
=={{header|jq}}== |
=={{header|jq}}== |