Text processing/2: Difference between revisions

Content added Content deleted

Inline

@@ Line 1,340: / Line 1,340: @@
 var analyze = analyze_func('readings.txt');
 analyze();</lang>
+=={{header|jq}}==
+{{works with|jq|with regex support}}
+For this problem, it is convenient to use jq in a pipeline: the first invocation of jq will convert the text file into a stream of JSON arrays (one array per line):
+<lang sh>$ jq -R '[splits("[ \t]+")]' Text_processing_2.txt</lang>
+The second part of the pipeline performs the task requirements. The following program is used in the second invocation of jq.
+'''Generic Utilities'''
+<lang jq># Run-length encode an array: emit one [item, count] pair per maximal
+# run of consecutive equal items, in order of appearance.
+def runs:
+  reduce .[] as $x
+    ( [];
+      if length > 0 and .[length-1][0] == $x
+      then .[length-1][1] += 1     # same item as the previous one: extend the current run
+      else . + [[$x, 1]]           # otherwise open a new run
+      end ) ;
+# Each of the following predicates expects a string as input and emits true or false.
+# is_float: optional sign, a decimal point with at least one digit on one side of it,
+# and an optional exponent. Plain integers such as "10000" are rejected.
+def is_float: test("^[-+]?([0-9]+[.][0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$");
+# is_integral: an optionally signed run of decimal digits.
+def is_integral: test("^[-+]?[0-9]+$");
+# is_date: a YYYY-MM-DD shaped string whose year starts with 1 or 2. The pattern is
+# anchored so that surrounding junk is rejected; only the shape is checked, not
+# whether the month and day form a real calendar date.
+def is_date: test("^[12][0-9]{3}-[0-9][0-9]-[0-9][0-9]$");</lang>
+'''Validation''':
+<lang jq># Emit one message for each problem found in the input record (an array of strings).
+# nr is the line number to report; lines and fields are both numbered from 1.
+# A well-formed record has 49 fields: a date followed by 24 (float, integer) pairs.
+def validate_line(nr):
+  def validate_date:
+    if is_date then empty else "field 1 in line \(nr) has an invalid date: \(.)" end;
+  def validate_length(n):
+    if length == n then empty else "line \(nr) has \(length) fields" end;
+  # Pair i (counting from 0) occupies array indices 2*i+1 and 2*i+2,
+  # which are reported as fields 2*i+2 and 2*i+3.
+  def validate_pair(i):
+    ( .[2*i + 1] as $n
+      | if ($n | is_float) then empty else "field \(2*i + 2) in line \(nr) is not a float: \($n)" end),
+    ( .[2*i + 2] as $n
+      | if ($n | is_integral) then empty else "field \(2*i + 3) in line \(nr) is not an integer: \($n)" end);
+  (.[0] | validate_date),
+  (validate_length(49)),
+  # Only complete pairs are checked. Without floor, a record with an even number of
+  # fields would make the last pair read one index past the end, and the regex test
+  # would fail on null instead of reporting; validate_length already flags such records.
+  (range(0; ((length-1) / 2 | floor)) as $i | validate_pair($i)) ;
+# Validate every record of the input array, reporting line numbers that start at 1.
+def validate_lines:
+  keys[] as $idx
+  | .[$idx]
+  | validate_line($idx + 1);</lang>
+'''Check for duplicate timestamps'''
+<lang jq># Emit a [timestamp, count] pair for every first-field value that occurs
+# more than once among the input records.
+def duplicate_timestamps:
+  map(.[0]) | sort | runs | map( select(.[1] > 1) );</lang>
+'''Number of valid readings for all instruments''':
+<lang jq># Count the input records that pass every check below.
+# Duplicate dates are not considered here, but each record is checked in full:
+# date format, field count, and all 24 (value, flag) pairs.
+def number_of_valid_readings:
+  def check:
+    . as $in
+    | (.[0] | is_date)
+      and length == 49
+      # all/2 walks the 24 pair indices exactly once; the one-argument all(f) would
+      # instead apply f to every field of the record, repeating the whole check per field.
+      and all(range(0; 24); $in[2*. + 1] | is_float)
+      # the flag must be an integer of at least 1 for the record to count
+      and all(range(0; 24); $in[2*. + 2] | (is_integral and tonumber >= 1) );
+   map(select(check)) | length ;</lang>
+'''Generate Report'''
+<lang jq>validate_lines,                      # one message per problem found in the input records
+"\nChecking for duplicate timestamps:",
+duplicate_timestamps,                 # array of [timestamp, count] pairs
+"\nThere are \(number_of_valid_readings) valid rows altogether."</lang>
+{{out}}
+'''Part 1: Simple demonstration'''
+To illustrate that the program does report invalid lines, we first use the six lines at the top but mangle the last line.
+<lang sh>$ jq -R  '[splits("[ \t]+")]' Text_processing_2.txt | jq -s -r -f  Text_processing_2.jq
+field 1 in line 6 has an invalid date: 991-04-03
+line 6 has 47 fields
+field 2 in line 6 is not a float: 10000
+field 3 in line 6 is not an integer: 1.0
+field 47 in line 6 is not an integer: x
+Checking for duplicate timestamps:
+[
+  [
+    "1991-03-31",
+    2
+  ]
+]
+There are 5 valid rows altogether.</lang>
+'''Part 2: readings.txt'''
+<lang sh>$ jq -R  '[splits("[ \t]+")]' readings.txt | jq -s -r -f  Text_processing_2.jq
+Checking for duplicate timestamps:
+[
+  [
+    "1990-03-25",
+    2
+  ],
+  [
+    "1991-03-31",
+    2
+  ],
+  [
+    "1992-03-29",
+    2
+  ],
+  [
+    "1993-03-28",
+    2
+  ],
+  [
+    "1995-03-26",
+    2
+  ]
+]
+There are 5017 valid rows altogether.</lang>
 =={{header|Lua}}==
 <lang lua>filename = "readings.txt"