Merge and aggregate datasets: Difference between revisions

Content added Content deleted

Inline

@@ Line 583: / Line 583: @@
 | 5005       | Kurtz    |            |           |           |
 </pre>
+=={{header|Haskell}}==
+===Reading and merging===
+Merging of fields and databases is defined as a monoid operation for corresponding types.
+<lang haskell>import Data.List
+import Data.Maybe (mapMaybe, maybeToList)
+import System.IO (readFile)
+import Text.Read (readMaybe)
+import Control.Applicative ((<|>))
+------------------------------------------------------------
+newtype DB = DB { entries :: [Patient] }
+  deriving Show
+instance Semigroup DB where
+  DB a <> DB b = normalize $ a <> b
+instance Monoid DB where
+  mempty = DB []
+normalize :: [Patient] -> DB
+normalize = DB
+            . map mconcat
+            . groupBy (\x y -> pid x == pid y)
+            . sortOn pid
+------------------------------------------------------------
+data Patient = Patient { pid :: String
+                       , name :: Maybe String
+                       , visits :: [String]
+                       , scores :: [Float] }
+  deriving Show
+instance Semigroup Patient where
+  p1 <> p2 = Patient
+    (if null (pid p1) then pid p2 else pid p1)
+    (name p1 <|> name p2)
+    (visits p1 <|> visits p2)
+    (scores p2 <|> scores p1)
+instance Monoid Patient where
+  mempty = Patient mempty mempty mempty mempty
+------------------------------------------------------------
+readDB :: String  -> DB
+readDB = normalize
+         . mapMaybe readPatient
+         . readCSV
+readPatient r = do
+  i <- lookup "PATIENT_ID" r
+  let n = lookup "LASTNAME" r
+  let d = lookup "VISIT_DATE" r >>= readDate
+  let s = lookup "SCORE" r >>= readMaybe
+  return $ Patient i n (maybeToList d) (maybeToList s)
+  where
+    readDate [] = Nothing
+    readDate d = Just d
+readCSV :: String -> [(String, String)]
+readCSV txt = zip header <$> body
+  where
+    header:body = splitBy ',' <$> lines txt
+    splitBy ch = unfoldr go
+      where
+        go [] = Nothing
+        go s  = Just $ drop 1 <$> span (/= ch) s</lang>
+<pre>let patients = readDB <$> readFile "patients.csv"
+*Main> let visits = readDB <$> readFile "visits.csv"
+*Main> mapM_ print . entries =<< patients
+Patient {pid = "1001", name = Just "Hopper", visits = [], scores = []}
+Patient {pid = "2002", name = Just "Gosling", visits = [], scores = []}
+Patient {pid = "3003", name = Just "Kemeny", visits = [], scores = []}
+Patient {pid = "4004", name = Just "Wirth", visits = [], scores = []}
+Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}
+*Main> mapM_ print . entries =<< visits
+Patient {pid = "1001", name = Nothing, visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
+Patient {pid = "2002", name = Nothing, visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
+Patient {pid = "3003", name = Nothing, visits = ["2020-11-12"], scores = []}
+Patient {pid = "4004", name = Nothing, visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}
+*Main> mapM_ print . entries =<< patients <> visits
+Patient {pid = "1001", name = Just "Hopper", visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
+Patient {pid = "2002", name = Just "Gosling", visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
+Patient {pid = "3003", name = Just "Kemeny", visits = ["2020-11-12"], scores = []}
+Patient {pid = "4004", name = Just "Wirth", visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}
+Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}</pre>
+===Pretty tabulation===
+<lang haskell>
+tabulateDB (DB ps) header cols = intercalate "|" <$> body
+  where
+    body = transpose $ zipWith pad width table
+    table = transpose $ header : map showPatient ps
+    showPatient p = sequence cols p
+    width = maximum . map length <$> table
+    pad n col = (' ' :) . take (n+1) . (++ repeat ' ') <$> col
+main = do
+  a <- readDB <$> readFile "patients.csv"
+  b <- readDB <$> readFile "visits.csv"
+  mapM_ putStrLn $ tabulateDB (a <> b) header fields
+  where
+    header = [ "PATIENT_ID", "LASTNAME", "VISIT_DATE"
+             , "SCORES SUM","SCORES AVG"]
+    fields = [ pid
+             , \p -> case name p of {Nothing -> []; Just n -> n}
+             , \p -> case visits p of {[] -> []; l -> last l}
+             , \p -> case scores p of {[] -> []; s -> show (sum s)}
+             , \p -> case scores p of {[] -> []; s -> show (mean s)} ]
+    mean lst = sum lst / genericLength lst</lang>
+<pre>*Main> main
+ PATIENT_ID | LASTNAME | VISIT_DATE | SCORES SUM | SCORES AVG
+| Hopper   | 2020-11-19 | 17.4       | 5.7999997
+| Gosling  | 2020-10-08 | 6.8        | 6.8
+| Kemeny   | 2020-11-12 |            |
+| Wirth    | 2020-11-05 | 15.4       | 7.7
+| Kurtz    |            |            |            </pre>
 =={{header|jq}}==