Convert CSV records to TSV: Difference between revisions

m
→‎{{header|Wren}}: Changed to Wren S/H
(removed the bare 'r' from the field content declarations -- it seems irrelevant and unnecessary, and is not mentioned at https://en.wikipedia.org/wiki/Parsing_expression_grammar#Syntax)
m (→‎{{header|Wren}}: Changed to Wren S/H)
 
(30 intermediate revisions by 6 users not shown)
Line 1:
{{draft task}}
The present task is focused on the conversion of strings of comma-separated
values (CSV) to strings of tab-separated values (TSV) subject to the specific
Line 7 ⟶ 8:
the operating system.
 
In accordance with the so-called robustness principle of processing, if a string or
input line begins with a valid (possibly empty) sequence of comma-separated
values but is followed by nonsense, then processing should continue based on
Line 13 ⟶ 14:
 
; A CSV record
[[Category:PEG]]
Our starting point will be a character set that includes ASCII; the language
of regular expressions (which will be denoted by strings of the form
r'REGEX'); and the following [[:Category:PEG|PEG]] (parsing expression grammar) grammar for a
single CSV record:
<pre>
Line 23 ⟶ 25:
ws = ' '
 
unquoted_field := r'[^,"]*'
 
quoted_field_content := r'(("")|([^"]))*'
</pre>
Since it has already been stipulated that partitioning of a file into
Line 32 ⟶ 34:
so there is no need to express that as a restriction in the PEG.
 
; "Robustness"
As mentioned above, if an input line or string begins with a valid sequence of
comma-separated values but is followed by nonsense, then processing should
Line 90 ⟶ 92:
the platform.
 
Use the following as a test file, noting(after changing the RETURN string to the RETURN control character) and note any discrepancies with the
requirements,; andin specificallyaddition, statingindicate how strings containing backslashes
(besides those preceding r'[nrt]) are transformed.
 
; TEST FILE
The test file should include a line with a RETURN ('\r') character.
One way to change the first RETURN in the following to the '\r' control character would be
to use `sed`, perhaps along the lines of: sed -i.bak $'s/RETURN/\r/' TEST.csv
<pre>
a,"b"
Line 107 ⟶ 112:
a\n\rb
a�b, That is a NUL character
aRETURNb, Be sure to change RETURN to the '\r' control character (#xd)
a
b, That is a LF (linefeed) character
a\b
</pre>
 
[[category:CSV]]
[[category:TSV]]
 
=={{header|ALGOL 68}}==
All input \ characters are doubled in the output.<br>
As with some of the other samples, the input data is stored in an array, not read from a file.
<syntaxhighlight lang="algol68">
BEGIN # convert lines of CSV to TSV #
CHAR nul = REPR 0;
CHAR tab = REPR 9;
CHAR lf = REPR 10;
CHAR cr = REPR 13;
# returns s with some control characters converted to <name> #
PROC show ctrl = ( STRING s )STRING:
BEGIN
STRING result := "";
FOR i FROM LWB s TO UPB s DO
result +:= IF s[ i ] = nul THEN "<nul>"
ELIF s[ i ] = tab THEN "<tab>"
ELIF s[ i ] = cr THEN "<cr>"
ELIF s[ i ] = lf THEN "<lf>"
ELSE s[ i ]
FI
OD;
result
END # show ctrl # ;
# returns csv converted to TSV #
PROC csv2tsv = ( STRING csv )STRING:
BEGIN
BOOL at end := FALSE;
CHAR ch := nul;
# sets ch to the next character in csv, if there is one #
PROC next = VOID: ch := IF c pos < c max
THEN csv[ c pos +:= 1 ]
ELSE at end := TRUE
; nul
FI;
# skips over spaces and returns the count of skipped spaces #
PROC spaces = INT:
BEGIN
INT s count := 0;
WHILE NOT at end AND ch = " " DO s count +:= 1; next OD;
s count
END # spaces # ;
# adds ch to the TSV - converting some characters to escaped form #
PROC add = VOID: tsv +:= IF ch = "\" THEN "\\"
ELIF ch = nul THEN "\0"
ELIF ch = cr THEN "\r"
ELIF ch = lf THEN "\n"
ELIF ch = tab THEN "\t"
ELSE ch
FI;
# parse the csv and generate the tsv #
STRING tsv := "";
INT c pos := LWB csv - 1;
INT c max = UPB csv;
WHILE NOT at end DO
# spaces are not significant around quoted fields but are part of unquoted fields #
next;
INT space count := spaces;
IF ch = """" THEN
# quoted field part #
WHILE next;
WHILE NOT at end AND ch /= """" DO add; next OD;
IF NOT at end THEN
next;
IF ch = """" THEN
# embedded quote #
add
FI
FI;
NOT at end AND ch = """"
DO SKIP OD;
space count := spaces;
IF at end OR ch = "," THEN
# nothing significant after the quoted field part #
space count := 0
FI
FI;
# unquoted field part #
tsv +:= space count * " ";
WHILE NOT at end AND ch /= "," DO add; next OD;
IF ch = "," THEN
# have another field following this one #
tsv +:= tab
FI
OD;
tsv
END # csv2tsv # ;
# task test cases #
[]STRING tests =
( "a,""b"""
, """a"",""b""""c"""
, ""
, ",a"
, "a,"""
, " a , ""b"""
, """12"",34"
, "a" + tab + "b, That is a TAB character"
, "a\tb"
, "a\n\rb"
, "a" + nul + "b, That is a NUL character"
, "a" + cr + "b, Be sure to change RETURN to the '\r' control character (#xd)"
, "a\b"
);
FOR i FROM LWB tests TO UPB tests DO
print( ( " {{", show ctrl( tests[ i ] ), "}}", newline
, " -> {{", show ctrl( csv2tsv( tests[ i ] ) ), "}}", newline
)
)
OD
END
</syntaxhighlight>
{{out}}
<pre>
{{a,"b"}}
-> {{a<tab>b}}
{{"a","b""c"}}
-> {{a<tab>b"c}}
{{}}
-> {{}}
{{,a}}
-> {{<tab>a}}
{{a,"}}
-> {{a<tab>}}
{{ a , "b"}}
-> {{ a <tab>b}}
{{"12",34}}
-> {{12<tab>34}}
{{a<tab>b, That is a TAB character}}
-> {{a\tb<tab> That is a TAB character}}
{{a\tb}}
-> {{a\\tb}}
{{a\n\rb}}
-> {{a\\n\\rb}}
{{a<nul>b, That is a NUL character}}
-> {{a\0b<tab> That is a NUL character}}
{{a<cr>b, Be sure to change RETURN to the '\r' control character (#xd)}}
-> {{a\rb<tab> Be sure to change RETURN to the '\\r' control character (#xd)}}
{{a\b}}
-> {{a\\b}}
</pre>
 
=={{header|J}}==
Implementation:
<syntaxhighlight lang=J>tokenize=: (0;(0 10#:10*do@>cutLF{{)n
1.1 1.1 1.1 1.1 1.1 NB. 0 start here
1.2 2.0 3.2 1.2 1.0 NB. 1 , or CR or LF starts a new "token"
2 4 2 2 2 NB. 2 quote toggles "quoted field mode"
1.2 2.0 1.0 2.2 1.0 NB. 3 CR,LF: 1 "token" CR,CR and CR,',': 2 "tokens"
1.2 2.0 3.2 1.2 5.3 NB. 4 closing quote must be followed by a delimiter
5 5 1.1 1.1 5 NB. 5 resync on newline after encountering nonsense
}});(;/',"',LF,CR);0 _1 0 _1) ;: LF,]
NB. , " CR LF ...
 
unquote=: {{
txt=. (1+CRLF-:2{.y)}.y NB. discard leading ',', LF, CR or CRLF
if. '"'={.txt do.
rplc&('""';'"') }. txt}.~-'"'={:txt
else.
txt
end.
}}
 
NUL=: 0{a.
 
canonical=: {{ NB. read if file and drop optional trailing newline
txt=. fread^:L. y
txt}.~-(CRLF-:_2{.txt)+({:txt)e.CRLF
}}
 
csv2mat=: (e.&CRLF@{.@> unquote each;.1 ])@tokenize@canonical
 
escape=: rplc&('\';'\\';NUL;'\0';TAB;'\t';LF;'\n';CR;'\r')@":
 
mat2tsv=: {{ ;LF,~each ([,TAB,])each/"1 escape each y }}
 
csv2tsv=: mat2tsv@csv2mat</syntaxhighlight>
 
In current J contexts, LF is always a newline character. CR and CRLF are considered obsolete. Here we support CR and CRLF as newline delimiters when reading, but standardize on LF when writing.
 
Also, we escape all backslashes, regardless of what character follows, during tsv encoding.
 
Here, also, we interpret "nonsense" as starting immediately after a closing quote which is not followed by a delimiter, and ending immediately before the most immediately following newline.
 
For csv parsing we first break out fields using <b><tt>[[j:Vocabulary/semico#dyadic|;:]]</tt></b>. Here, each field is preceded by a delimiter. (We discard an optional trailing newline from the csv text and prepend a newline at the beginning so that every field has a preceding delimiter. Also, of course, if we were given a file reference, we work with the text of the file rather than its name.)
 
Then, these fields are formed into rows (fields which began immediately following newlines start a new row), and each field is stripped of delimiters and non-textual quotes are removed. The result is a two-dimensional matrix: the result of csv2mat.
 
To translate to tsv form, we would first escape special characters in each field, then insert delimiters between each field and terminate each record with a newline. Thus, mat2tsv takes a two-dimensional matrix and its result is a tsv string. (For utility, mat2tsv also supports numeric matrices, since that was trivial.)
 
Task example:
<syntaxhighlight lang=J> csv2tsv test
a b
a b"c
\n a
a \n a ,
12 34
a\tb That is a TAB character
a\\tb
a\\n\\rb
a\0b That is a NUL character
a
b That is a LF (linefeed) character
a\\b
</syntaxhighlight>
 
=={{header|jq}}==
Line 116 ⟶ 327:
 
'''The following program can also be used with gojq, the Go implementation of jq,
but until recently NUL (#x0) iswas left unaltered.'''
 
In this entry, the PEG grammar for "record" as defined in the task
Line 122 ⟶ 333:
closely to jq operators, notably PEG's '/' to jq's '//'.
 
In translating the PEG grammar to a jq program, the main idea is to define a
pipeline for each grammar rule. A JSON object with keys "remainder" and
"result" is passed through this pipeline, consuming the text in .remainder and
Line 160 ⟶ 371:
 
def record: field | star(consume(",") | field);
 
def parse: {remainder: .} | record | .result;
 
def csv2tsv:
{remainder: .} | record | .result | @tsv ;
parse
| @tsv ;
 
# Transform an entire file: assuming jq is invoked with the -n option
inputs | csv2tsv
</syntaxhighlight>
{{output}}
As required:
* Backquotes are uniformly duplicated.
* Backslashes are uniformly duplicated.
* gojq does not, and currently cannot, handle NUL (#x0) properly.
* Until recently gojq did not handle NUL (#x0) properly.
 
=={{header|Julia}}==
{{trans|Phix}}
<syntaxhighlight lang="julia">function csv_tsv(str)
p = split(str, ",")
for (i, f) in enumerate(p)
if count(==('"'), f) > 1
p[i] = replace(strip(f, [' ', '"']), "\"\"" => "\"")
elseif f == "\""
p[i] = ""
end
end
t = join(p,"<TAB>")
s = replace(str, "\\" => "\\\\", "\t" => "\\t", "\0" => "\\0", "\n" => "\\n", "\r" => "\\r")
t = replace(t, "\\" => "\\\\", "\t" => "\\t", "\0" => "\\0", "\n" => "\\n", "\r" => "\\r")
return s, t
end
 
const testfile = "test.tmp"
fh = open(testfile, "w")
 
write(fh, """
a,"b"
"a","b""c"
 
,a
a,"
a , "b"
"12",34
a\tb, TAB
a\\tb
a\\n\\rb
a\0b, NUL
a\rb, RETURN
a\\b""")
 
close(fh)
 
for test_string in split(read(testfile, String), "\n")
csv, tsv = csv_tsv(test_string)
println(lpad(csv, 12), " => ", tsv)
end
</syntaxhighlight>{{out}}
<pre>
a,"b" => a<TAB>b
"a","b""c" => a<TAB>b"c
=>
,a => <TAB>a
a," => a<TAB>
a , "b" => a <TAB>b
"12",34 => 12<TAB>34
a\tb, TAB => a\tb<TAB> TAB
a\\tb => a\\tb
a\\n\\rb => a\\n\\rb
a\0b, NUL => a\0b<TAB> NUL
a\rb, RETURN => a\rb<TAB> RETURN
a\\b => a\\b
</pre>
 
=={{header|Phix}}==
Note that backtick strings are not escaped, whereas double-quoted strings are.
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">csv2tsv</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">p</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #008000;">','</span><span style="color: #0000FF;">,</span><span style="color: #004600;">false</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #000000;">f</span> <span style="color: #008080;">in</span> <span style="color: #000000;">p</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">find_all</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'"'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">f</span><span style="color: #0000FF;">))></span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">p</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #000000;">f</span><span style="color: #0000FF;">,</span><span style="color: #008000;">` "`</span><span style="color: #0000FF;">),</span><span style="color: #008000;">`""`</span><span style="color: #0000FF;">,</span><span style="color: #008000;">`"`</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">f</span><span style="color: #0000FF;">=</span><span style="color: #008000;">`"`</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">p</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">t</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join</span><span style="color: #0000FF;">(</span><span style="color: #000000;">p</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"&lt;tab&gt;"</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute_all</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,{</span><span style="color: #008000;">`\`</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\t"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\0'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">},{</span><span style="color: #008000;">`\\`</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\t"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\0"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">})</span>
<span style="color: #000000;">t</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute_all</span><span style="color: #0000FF;">(</span><span style="color: #000000;">t</span><span style="color: #0000FF;">,{</span><span style="color: #008000;">`\`</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\t"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\0'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">},{</span><span style="color: #008000;">`\\`</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\t"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\0"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">})</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%12s =&gt; %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #000000;">t</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">test_data</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">`a,"b"`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">`"a","b""c"`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">``</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">`,a`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">`a,"`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">` a , "b"`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">`"12",34`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">"a\tb,"</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- That is a TAB character</span>
<span style="color: #008000;">`a\tb`</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- That is not</span>
<span style="color: #008000;">`a\n\rb`</span><span style="color: #0000FF;">,</span>
<span style="color: #008000;">"a\0b"</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- That is a NUL character</span>
<span style="color: #008000;">"a\nb"</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- That is a LF (linefeed) character</span>
<span style="color: #008000;">`a\b`</span><span style="color: #0000FF;">}</span>
<span style="color: #7060A8;">papply</span><span style="color: #0000FF;">(</span><span style="color: #000000;">test_data</span><span style="color: #0000FF;">,</span><span style="color: #000000;">csv2tsv</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
a,"b" => a<tab>b
"a","b""c" => a<tab>b"c
=>
,a => <tab>a
a," => a<tab>
a , "b" => a <tab>b
"12",34 => 12<tab>34
a\tb, => a\tb<tab>
a\\tb => a\\tb
a\\n\\rb => a\\n\\rb
a\0b => a\0b
a\nb => a\nb
a\\b => a\\b
</pre>
 
=={{header|Wren}}==
{{libheader|Wren-ioutil}}
{{libheader|Wren-str}}
Backslashes are only duplicated for escaped \t, \n and \r.
<syntaxhighlight lang="wren">import "./ioutil" for FileUtil
import "./str" for Str
 
// These could be given as command line arguments
// but we hard-code them for the purposes of this task.
var fileName1 = "test.csv"
var fileName2 = "test.tsv"
 
// This next line assumes a line break of "\r\n" for Windows or "\n" otherwise.
var lines = FileUtil.readLines(fileName1)
 
// Get rid of an empty last line if there is one.
if (lines.count > 1 && lines[-1] == "") lines = lines[0...-1]
 
var lc = lines.count
 
// Normalize fields before rejoining with \t.
for (i in 0...lc) {
// Str.splitCSv treats quoted fields by default as unquoted if there's any
// leading whitespace but we can't do that here.
var fields = Str.splitCsv(lines[i], ",", false)
for (i in 0...fields.count) {
var numQuotes = fields[i].count { |c| c == "\"" }
// Treat it as a quoted field for this task if there's at least two quotes
// and then remove any surrounding whitespace and the outer quotes.
if (numQuotes > 1) fields[i] = fields[i].trim().trim("\"")
 
fields[i] = fields[i].replace("\"\"", "\"")
.replace("\\t", "\\\\t")
.replace("\\r", "\\\\r")
.replace("\\n", "\\\\n")
.replace("\t", "\\t")
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\0", "\\0")
}
// Not sure how 'nonsense' is defined but for now blank
// the final field if it contains nothing but quotes.
if (fields[-1].count > 0 && fields[-1].all { |c| c == "\"" }) fields[-1] = ""
lines[i] = fields.join("\t")
}
 
// Write lines (as amended) to fileName2.
FileUtil.writeLines(fileName2, lines)
 
// Write contents of fileName2 to the terminal with tabs replaced by <tab> and
// with each line surrounded with single quotes.
lines = FileUtil.readLines(fileName2)[0...-1]
.map { |line| "'%(line)'".replace("\t", "<tab>") }
.join("\n")
System.print(lines)</syntaxhighlight>
 
{{out}}
<pre>
'a<tab>b'
'a<tab>b"c'
''
'<tab>a'
'a<tab>'
' a <tab>b'
'12<tab>34'
'a\tb<tab> That is a TAB character'
'a\\tb'
'a\\n\\rb'
'a\0b<tab> That is a NUL character'
'a\rb<tab> That is a RETURN character'
'a\b'
</pre>
9,488

edits