Tokenize a string with escaping

From Rosetta Code
Task
Tokenize a string with escaping
You are encouraged to solve this task according to the task description, using any language you may know.

Write a function or program that can split a string at each non-escaped occurrence of a separator character.

It should accept three input parameters:

  •   The string
  •   The separator character
  •   The escape character


It should output a list of strings.

Details

Rules for splitting:

  • The fields that were separated by the separators, become the elements of the output list.
  • Empty fields should be preserved, even at the start and end.


Rules for escaping:

  • "Escaped" means preceded by an occurrence of the escape character that is not already escaped itself.
  • When the escape character precedes a character that has no special meaning, it still counts as an escape (but does not do anything special).
  • Each occurrence of the escape character that was used to escape something, should not become part of the output.


Test case

Demonstrate that your function satisfies the following test-case:

Input Output
string:
one^|uno||three^^^^|four^^^|^cuatro|
separator character:
|
escape character:
^
one|uno

three^^
four^|cuatro

(Print the output list in any format you like, as long as it is easy to see what the fields are.)

Other tasks related to string operations:
Metrics
Counting
Remove/replace
Anagrams/Derangements/shuffling
Find/Search/Determine
Formatting
Song lyrics/poems/Mad Libs/phrases
Tokenize
Sequences



11l[edit]

Translation of: Python
// Split `a` at every separator character that is not escaped and return the
// fields as a list of strings. Escape characters that were used to escape
// something are not copied to the output.
// `state` is 0 while reading ordinary text and 1 directly after an escape.
F token_with_escape(a, escape = ‘^’, separator = ‘|’)
   [String] result
   V token = ‘’
   V state = 0
   L(c) a
      I state == 0
         I c == escape
            state = 1
         E I c == separator
            result.append(token)
            token = ‘’
         E
            token ‘’= c
      E I state == 1
         token ‘’= c
         state = 0
   result.append(token)
   R result

// Demo: print every field of the task's test string wrapped in single quotes.
print(token_with_escape(‘one^|uno||three^^^^|four^^^|^cuatro|’).map(s -> ‘'’s‘'’).join(‘, ’))
Output:
'one|uno', '', 'three^^', 'four^|cuatro', ''

8080 Assembly[edit]

	org	100h
	jmp	demo
	;;;	Routine to split a 0-terminated string
	;;;	Input: B=separator, C=escape, HL=string pointer.
	;;;	Output: DE=address of the zero terminator of the last string
	;;;	The split strings are stored in place, each one zero-terminated,
	;;;	with the escape characters removed.
split:	mov	d,h	; Set DE = output pointer
	mov	e,l
snext:	mov	a,m	; Get current input character
	inx	h	; Advance input pointer
	stax	d	; Write character at output pointer
	ana	a	; If zero, we are done
	rz
	cmp	c	; Is it the escape character?
	jz	sesc
	cmp	b	; Is it the separator character?
	jz	ssep
	inx	d	; Otherwise, advance output pointer,
	jmp	snext	; and get the next character
sesc:	mov	a,m	; Store the escaped character without
	inx	h	; checking for anything except zero.
	stax	d	; This overwrites the escape written at snext
	ana	a	; Zero is still end of string; return before
	rz		; advancing DE so it stays on the terminator
	inx	d	; INX leaves the flags alone
	jmp	snext
ssep:	xra	a	; End of string, write zero terminator
	stax	d
	inx	d
	jmp	snext
	;;;	Use the routine to split the test-case string
	;;;	DE (returned by split) is kept across the loop and compared with
	;;;	the print pointer in HL to decide when to stop.
demo:	mvi	b,'|'	; Separator character
	mvi	c,'^'	; Escape character
	lxi	h,test	; Pointer to test string
	call	split
	;;;	Print each string on its own line
	lxi	h,test
str:	call	puts	; Print string
	call	cmp16	; Are we there yet?
	jnc	str	; If not, print the next string
	ret
	;;;	16-bit compare
	;;;	Compares DE with HL: D against H first, and E against L only
	;;;	when the high bytes are equal. The flags on return are those of
	;;;	the last CMP executed. A is overwritten.
cmp16:	mov	a,d
	cmp	h
	rnz
	mov	a,e
	cmp	l
	ret
	;;;	Print zero-terminated string with newline
	;;;	Input: HL=string pointer. DE is saved and restored.
	;;;	On return HL points just past the string's zero terminator.
	;;;	Each character is sent to the output call before it is tested,
	;;;	so the terminating zero byte is output as well.
puts:	push	d	; Keep DE registers
	push	h	; Keep pointer
	lxi 	d,pfx	; Print prefix
	mvi	c,9
	call	5
	pop	h	; Restore pointer
ploop:	mov	e,m	; Get current character
	push	h	; Keep pointer
	mvi	c,2	; CP/M print character
	call	5
	pop	h	; Restore pointer
	mov	a,m	; Is character zero?
	ora	a
	inx	h	; Increment pointer
	jnz	ploop	; If not, there are more characters
	push	h	; Keep pointer
	lxi	d,nl	; Write newline
	mvi	c,9	; CP/M print string
	call	5
	pop	h
	pop	d	; Restore DE registers
	ret
pfx:	db	'> $'	; Prefix to make the output more obvious
nl:	db	13,10,'$'
test:	db	'one^|uno||three^^^^|four^^^|^cuatro|',0
Output:
> one|uno
>
> three^^
> four^|cuatro
>

Action![edit]

DEFINE PTR="CARD"

; Result of a tokenization.
; buf    - start of the byte buffer that holds the token strings
; arr    - array with one pointer per stored token
; endPtr - address where the next token will be stored
; count  - number of tokens stored so far
TYPE Tokens=[
  PTR buf   ;BYTE ARRAY
  PTR arr   ;CARD ARRAY
  PTR endPtr
  BYTE count]

; Prepare t as an empty token list that stores its strings in buffer b
; and its string pointers in array a.
PROC Init(Tokens POINTER t BYTE ARRAY b PTR ARRAY a)
  t.buf=b
  t.arr=a
  t.endPtr=b
  t.count=0
RETURN

; Append a copy of string s to token list t.
; The copy is written at t.endPtr, its address is stored in the pointer
; array, and endPtr moves forward by s(0)+1 bytes (the length byte plus
; the characters). No capacity check is made here.
PROC AddToken(Tokens POINTER t CHAR ARRAY s)
  PTR ARRAY a
  CHAR ARRAY tmp

  a=t.arr
  tmp=t.endPtr
  SCopy(tmp,s)
  a(t.count)=tmp
  t.count==+1
  t.endPtr=t.endPtr+s(0)+1
RETURN

; Print every token of t on its own line, enclosed in double quotes.
; NOTE(review): the loop bound is t.count-1 on a BYTE, so this presumably
; expects at least one token - confirm for an empty list.
PROC PrintTokens(Tokens POINTER t)
  BYTE i
  PTR ARRAY a
  
  a=t.arr
  FOR i=0 TO t.count-1
  DO
    PrintF("""%S""%E",a(i))
  OD
RETURN

; Append character c to string s: increment the length byte s(0) and
; store c at the new last position.
PROC Append(CHAR ARRAY s CHAR c)
  s(0)==+1
  s(s(0))=c
RETURN

; Split s at every separator sep that is not escaped by esc and store the
; fields in res. The token storage handed to res is the local arrays b
; (200 bytes of text) and a (20 pointers) declared below.
; isEsc is 1 directly after an escape character; the next character is
; then copied as-is and the escape itself is not copied.
PROC Tokenize(CHAR ARRAY s CHAR sep,esc Tokens POINTER res)
  BYTE ARRAY b(200)
  PTR ARRAY a(20)
  CHAR ARRAY tmp(255)
  BYTE i,isEsc
  CHAR c

  Init(res,b,a)
  isEsc=0
  tmp(0)=0
  FOR i=1 TO s(0)
  DO
    c=s(i)
    IF isEsc THEN
      isEsc=0
      Append(tmp,c)
    ELSE
      IF c=esc THEN
        isEsc=1
      ELSEIF c=sep THEN
        ; an unescaped separator closes the current field
        AddToken(res,tmp)
        tmp(0)=0
      ELSE
        Append(tmp,c)
      FI
    FI
  OD
  ; the last field is stored even when it is empty
  AddToken(res,tmp)
RETURN

; Demo: tokenize the task's test string with '|' as separator and '^' as
; escape character, then print the resulting tokens.
PROC Main()
  Tokens t

  Tokenize("one^|uno||three^^^^|four^^^|^cuatro|",'|,'^,t)
  PrintTokens(t)
RETURN
Output:

Screenshot from Atari 8-bit computer

"one|uno"
""
"three^^"
"four^|cuatro"
""

Ada[edit]

with Ada.Text_Io;
with Ada.Containers.Indefinite_Vectors;
with Ada.Strings.Unbounded;

procedure Tokenize is

   package String_Vectors is
     new Ada.Containers.Indefinite_Vectors (Positive, String);
   use String_Vectors;

   --  Split Text at every Separator that is not escaped and return the
   --  fields in order. The character following an Escape is copied
   --  literally; the Escape itself is not copied. The last field is
   --  always returned, even when it is empty.
   function Split (Text      : String;
                   Separator : Character := '|';
                   Escape    : Character := '^') return Vector
   is
      use Ada.Strings.Unbounded;
      Fields       : Vector;
      Current      : Unbounded_String;
      Take_Literal : Boolean := False;
   begin
      for C of Text loop
         if Take_Literal then
            --  Previous character was an escape: keep this one as-is.
            Append (Current, C);
            Take_Literal := False;
         elsif C = Escape then
            Take_Literal := True;
         elsif C = Separator then
            Fields.Append (To_String (Current));
            Current := Null_Unbounded_String;
         else
            Append (Current, C);
         end if;
      end loop;

      Fields.Append (To_String (Current));
      return Fields;
   end Split;

   --  Print each element of List on its own line, wrapped in single
   --  quotes so that empty fields remain visible.
   procedure Put_Vector (List : Vector) is
   begin
      for Field of List loop
         Ada.Text_Io.Put_Line ("'" & Field & "'");
      end loop;
   end Put_Vector;

begin
   Put_Vector (Split ("one^|uno||three^^^^|four^^^|^cuatro|"));
end Tokenize;
Output:
'one|uno'
''
'three^^'
'four^|cuatro'
''

ALGOL 68[edit]

BEGIN
    # returns s parsed according to delimiter and escape                      #
    # an unescaped delimiter starts a new element; the character following    #
    # an escape is copied as-is and the escape itself is dropped              #
    # an empty s yields an empty array; an escape that is the last character  #
    # of s is silently ignored                                                #
    PROC parse with escapes = ( STRING s, CHAR delimiter, escape )[]STRING:
         IF ( UPB s - LWB s ) + 1 < 1 THEN
            # empty string                                                    #
            [ 1 : 0 ]STRING empty array;
            empty array
         ELSE
            # at least one character                                          #
            # allow for a string composed entirely of delimiter characters    #
            [ 1 : ( UPB s - LWB s ) + 3 ]STRING result;
            INT r pos := 1;
            INT s pos := LWB s;
            result[ r pos ] := "";
            WHILE s pos <= UPB s DO
                CHAR c = s[ s pos ];
                IF  c = delimiter THEN
                    # start a new element                                     #
                    result[ r pos +:= 1 ] := ""
                ELIF c = escape THEN
                    # use the next character even if it is an escape          #
                    s pos +:= 1;
                    # <= so that an escaped final character is kept too       #
                    IF s pos <= UPB s THEN
                        # the escape is not the last character                #
                        result[ r pos ] +:= s[ s pos ]
                    FI
                ELSE
                    # normal character                                        #
                    result[ r pos ] +:= c
                FI;
                s pos +:= 1
            OD;
            result[ 1 : r pos ]
         FI; # parse with escapes #
    # task test case                                                          #
    []STRING tokens = parse with escapes( "one^|uno||three^^^^|four^^^|^cuatro|", "|", "^" );
    FOR t pos FROM LWB tokens TO UPB tokens DO print( ( "[", tokens[ t pos ], "]", newline ) ) OD
END
Output:
[one|uno]
[]
[three^^]
[four^|cuatro]
[]

AppleScript[edit]

Translation of: JavaScript
------------------ TOKENIZE WITH ESCAPING ----------------

-- tokenize :: String -> Character -> Character -> [String]
-- Folds charParse over the items returned by splitOn("", str), carrying a
-- record with the escape state, the token being built and the tokens
-- completed so far. The token still being built at the end is appended
-- to the result.
on tokenize(str, delimChar, chrEsc)
    
    script charParse
        -- Record: {esc:Bool, token:String, tokens:[String]}
        -- charParse :: Record -> Character -> Record
        on |λ|(a, x)
            set blnEsc to esc of a
            -- x acts as an escape only when it is not itself escaped
            set blnEscChar to ((not blnEsc) and (x = chrEsc))
            
            if ((not blnEsc) and (x = delimChar)) then
                -- unescaped delimiter: close the current token
                set k to ""
                set ks to (tokens of a) & token of a
            else
                -- an active escape adds nothing; anything else is appended
                set k to (token of a) & cond(blnEscChar, "", x)
                set ks to tokens of (a)
            end if
            
            {esc:blnEscChar, token:k, tokens:ks}
        end |λ|
    end script
    
    set recParse to foldl(charParse, ¬
        {esc:false, token:"", tokens:[]}, splitOn("", str))
    
    tokens of recParse & token of recParse
end tokenize


--------------------------- TEST -------------------------
-- Tokenizes the task's test string and builds a report with one numbered,
-- tab-separated line per token (numbering starts at 1).
on run
    script numberedLine
        on |λ|(a, s)
            set iLine to lineNum of a
            {lineNum:iLine + 1, report:report of a & iLine & ":" & tab & s & linefeed}
        end |λ|
    end script
    
    report of foldl(numberedLine, {lineNum:1, report:""}, ¬
        tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^"))
end run


-------------------- GENERIC FUNCTIONS -------------------

-- foldl :: (a -> b -> a) -> a -> [b] -> a
-- Left fold: starts with startValue and, for each item of xs in order,
-- replaces the accumulator with the result of calling f's |λ| handler.
-- The handler is passed the accumulator, the item, its 1-based index and
-- the whole list.
on foldl(f, startValue, xs)
    tell mReturn(f)
        set v to startValue
        set lng to length of xs
        repeat with i from 1 to lng
            set v to |λ|(v, item i of xs, i, xs)
        end repeat
        return v
    end tell
end foldl


-- Lift 2nd class handler function into 1st class script wrapper 
-- mReturn :: Handler -> Script
-- A value that is already a script is returned unchanged; anything else
-- is wrapped in a script whose |λ| property is that value.
on mReturn(f)
    if class of f is script then
        f
    else
        script
            property |λ| : f
        end script
    end if
end mReturn


-- splitOn :: String -> String -> [String]
-- Returns the text items of src using pat as the text item delimiter.
-- The previous delimiter setting is saved first and restored afterwards.
on splitOn(pat, src)
    set {dlm, my text item delimiters} to ¬
        {my text item delimiters, pat}
    set xs to text items of src
    set my text item delimiters to dlm
    return xs
end splitOn


-- cond :: Bool -> a -> a -> a
-- Value-level conditional: yields f when bool is true, otherwise g.
-- Both alternatives are ordinary arguments, already evaluated by the caller.
on cond(bool, f, g)
    if not bool then return g
    return f
end cond
Output:
1:    one|uno
2:    
3:    three^^
4:    four^|cuatro
5:    

Arturo[edit]

; tokenize: print the fields of s, one per line, splitting at each
; separator sep that is not escaped by esc.
; Nothing is returned; the fields are written directly to the output.
; escaping is 1 directly after an escape character, so the next character
; is printed as-is.
tokenize: function [s sep esc][
	escaping: 0

	loop 0..(size s)-1 [i][
		chr: get split s i

		if? escaping=1 [
			prints chr
			escaping: 0
		]
		else [
			case [chr]
				when? [=sep] [print ""]
				when? [=esc] [escaping: 1]
				else [prints chr]
		]
	]
	print ""
]

; Demo with the task's test string.
str: "one^|uno||three^^^^|four^^^|^cuatro|" 
tokenize str "|" "^"
Output:
one|uno

three^^
four^|cuatro

AutoHotkey[edit]

; Tokenize(s, d, e): split s on separator d with escape character e and
; return the resulting array.
; Works by substitution rather than by scanning:
;   1. every doubled escape (e e) becomes the placeholder Chr(0xFFFE),
;   2. every escaped separator (e d) becomes the placeholder Chr(0xFFFF),
;   3. the remaining escape characters are removed,
;   4. the text is split on d,
;   5. in each field the placeholders are turned back into e and d.
; Input that already contains either placeholder character would be
; rewritten in step 5 as well.
Tokenize(s,d,e){
	for i,v in x:=StrSplit(StrReplace(StrReplace(StrReplace(s,e e,Chr(0xFFFE)),e d,Chr(0xFFFF)),e),d)
		x[i]:=StrReplace(StrReplace(v,Chr(0xFFFE),e),Chr(0xFFFF),d)
	return x
}
Examples:
str := "one^|uno||three^^^^|four^^^|^cuatro|"
for i, v in Tokenize(str, "|", "^")
	output .= i " : " v "`n"
MsgBox % output
Output:
1 : one|uno
2 : 
3 : three^^
4 : four^|cuatro
5 : 

BBC BASIC[edit]

REM >tokenizer
REM Demo: tokenize the task's test string with "|" as separator and "^" as escape.
PROC_tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
END
:
REM PROC_tokenize prints the fields of src$, one per line, each preceded
REM by its field number. Fields are split at every sep$ that is not
REM escaped by esc$. Nothing is returned.
DEF PROC_tokenize(src$, sep$, esc$)
LOCAL field%, char$, escaping%, i%
field% = 1
escaping% = FALSE
PRINT field%; " ";
FOR i% = 1 TO LEN src$
  char$ = MID$(src$, i%, 1)
  IF escaping% THEN
    REM The character after an escape is printed as-is.
    PRINT char$;
    escaping% = FALSE
  ELSE
    CASE char$ OF
    WHEN sep$
      PRINT
      field% += 1
      PRINT field%; " ";
    WHEN esc$
      escaping% = TRUE
    OTHERWISE
      PRINT char$;
    ENDCASE
  ENDIF
NEXT
REM Finish the line of the last field.
PRINT
ENDPROC
Output:
         1 one|uno
         2 
         3 three^^
         4 four^|cuatro
         5 

BQN[edit]

# Test string from the task description.
str ← "one^|uno||three^^^^|four^^^|^cuatro|"
# Split is defined here but is not referenced by SplitE or by the demo line below.
Split ← ((⊢-˜+`׬)∘=⊔⊢)
# SplitE: split 𝕩 on '|' using '^' as the escape character (both are hard-coded).
#   esc - 1 at each '^' whose preceding position is not itself marked in esc
#   rem - esc shifted one place to the right, i.e. the characters directly after a marked '^'
#   spl - 1 at each '|' that is not marked in rem
# The last line groups the characters by the running count of spl; positions
# marked in esc or spl get index ¯1 and therefore end up in no group.
# NOTE(review): the result shown on the page has no trailing empty field, so
# a final empty group is apparently not produced - confirm.
SplitE ← {
  esc ← <`'^'=𝕩
  rem ← »esc
  spl ← (¬rem)∧'|'=𝕩 
  𝕩⊔˜(⊢-(esc∨spl)×1⊸+)+`spl
}

•Show SplitE str
⟨ "one|uno" ⟨⟩ "three^^" "four^|cuatro" ⟩

C[edit]

Works with: C
#include <stdlib.h>
#include <stdio.h>

#define STR_DEMO "one^|uno||three^^^^|four^^^|^cuatro|"
#define SEP '|'
#define ESC '^'

typedef char* Str; /* just for an easier reading */

/* ===> FUNCTION PROTOTYPES <================================================ */
unsigned int ElQ( const char *s, char sep, char esc );
Str *Tokenize( char *s, char sep, char esc, unsigned int *q );

/*==============================================================================
Main function.
Just passes a copy of the STR_DEMO string to the tokenization function and shows
the results. A copy is used because Tokenize() modifies the string it is given.
The token count and indexes are unsigned, so they are printed with %u.
==============================================================================*/

int main() {
    char s[] = STR_DEMO;
    unsigned int i, q;

    Str *list = Tokenize( s, SEP, ESC, &q );

    if( list != NULL ) {
        printf( "\n Original string: %s\n\n", STR_DEMO );
        printf( " %u tokens:\n\n", q );

        for( i=0; i<q; ++i )
            printf( " %4u. %s\n", i+1, list[i] );

        free( list );
    }

    return 0;
}

/*==============================================================================
"ElQ" stands for "Elements Quantity". Counts the number of tokens in the string
s, according to the separator character provided in sep and the escape character
provided in esc. The character that follows an escape character is never
treated as a separator or as another escape. The result is always at least 1.
==============================================================================*/

unsigned int ElQ( const char *s, char sep, char esc ) {
    unsigned int q = 1;     /* even an empty string holds one (empty) token */
    int escaped = 0;        /* non-zero right after an unescaped escape     */
    const char *p;

    for( p = s; *p; ++p ) {
        if( escaped )
            escaped = 0;    /* escaped character: never special             */
        else if( *p == esc )
            escaped = 1;
        else if( *p == sep )
            ++q;
    }

    return q;
}

/*==============================================================================
The actual tokenization function.
Allocates as much dynamic memory as needed to contain the pointers to the
tokenized portions of the string passed as the "s" parameter, then rewrites the
string in place: escape characters that escape something are removed, each
unescaped separator sep becomes a '\0' terminator, and a pointer to the start of
every token is stored into the array of pointers. The rewritten text is never
longer than the original, so it always fits in the caller's buffer.
On output, the value of *q is the number of pointers in the array. The caller is
responsible for deallocating with free() the returned array of pointers when it
is no longer needed. The tokens themselves live inside s.
In case of failure (NULL arguments or no memory), NULL is returned.
The return type char ** is the same type as the "Str *" used in the prototype.
==============================================================================*/

char **Tokenize( char *s, char sep, char esc, unsigned int *q ) {
    char **list;
    unsigned int i = 0;
    int escaped = 0;
    char *rd, *wr;          /* read and write positions inside s            */

    if( s == NULL || q == NULL )
        return NULL;

    *q = ElQ( s, sep, esc );
    list = malloc( *q * sizeof *list );

    if( list == NULL )
        return NULL;

    list[i++] = s;

    for( rd = wr = s; *rd; ++rd ) {
        if( escaped ) {
            *wr++ = *rd;            /* keep the escaped character as-is     */
            escaped = 0;
        }
        else if( *rd == esc ) {
            escaped = 1;            /* drop the escape itself               */
        }
        else if( *rd == sep ) {
            *wr++ = '\0';           /* close the current token              */
            if( i < *q )            /* never write past the allocation      */
                list[i++] = wr;
        }
        else {
            *wr++ = *rd;
        }
    }
    *wr = '\0';                     /* terminate the last token             */
    *q = i;

    return list;
}
Output:
 Original string: one^|uno||three^^^^|four^^^|^cuatro|

 5 tokens:

    1. one^|uno
    2.
    3. three^^^^
    4. four^^^|^cuatro
    5.

C#[edit]

using System;
using System.Text;
using System.Collections.Generic;

/// <summary>Demo program: tokenizes the task's test string and prints each field.</summary>
public class TokenizeAStringWithEscaping
{
    public static void Main() {
        const string testcase = "one^|uno||three^^^^|four^^^|^cuatro|";
        IEnumerable<string> fields = testcase.Tokenize(separator: '|', escape: '^');
        foreach (string field in fields) {
            // Each line starts with ": " so that empty fields stay visible.
            Console.WriteLine(": " + field);
        }
    }
}

public static class Extensions
{
    /// <summary>
    /// Splits <paramref name="input"/> at every occurrence of
    /// <paramref name="separator"/> that is not escaped by
    /// <paramref name="escape"/>. The character following an escape is copied
    /// literally and the escape itself is dropped.
    /// </summary>
    /// <returns>
    /// The fields in order, including empty ones. A null input yields no
    /// fields; any other input (including the empty string) yields at least one.
    /// </returns>
    public static IEnumerable<string> Tokenize(this string input, char separator, char escape) {
        if (input == null) yield break;
        var buffer = new StringBuilder();
        bool escaping = false;
        foreach (char c in input) {
            if (escaping) {
                buffer.Append(c);
                escaping = false;
            } else if (c == escape) {
                escaping = true;
            } else if (c == separator) {
                yield return buffer.Flush();
            } else {
                buffer.Append(c);
            }
        }
        // The text after the last separator is always a field, even when it is
        // empty. Emitting it unconditionally also avoids indexing into an empty
        // input string.
        yield return buffer.Flush();
    }
    
    /// <summary>Returns the builder's current contents and empties the builder.</summary>
    public static string Flush(this StringBuilder stringBuilder) {
        string result = stringBuilder.ToString();
        stringBuilder.Clear();
        return result;
    }
}
Output:
: one|uno
: 
: three^^
: four^|cuatro
: 

C++[edit]

#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

// Splits `input` at every occurrence of `seperator` that is not escaped by
// `escape`. The character following an escape is copied literally and the
// escape itself is dropped. Empty fields are preserved, including a trailing
// one.
//
// Throws std::invalid_argument (by value, so that it can be caught as
// const std::exception&) when the input ends with an unescaped escape
// character.
std::vector<std::string> tokenize(const std::string& input, char seperator, char escape) {
    std::vector<std::string> output;
    std::string token;

    bool inEsc = false;
    for (char ch : input) {
        if (inEsc) {
            // Escaped character: fall through and append it unchanged.
            inEsc = false;
        } else if (ch == escape) {
            inEsc = true;
            continue;
        } else if (ch == seperator) {
            output.push_back(token);
            token.clear();
            continue;
        }
        token += ch;
    }
    if (inEsc)
        throw std::invalid_argument("Invalid terminal escape");

    output.push_back(token);
    return output;
}

int main() {
    string sample = "one^|uno||three^^^^|four^^^|^cuatro|";

    cout << sample << endl;
    cout << '[';
    for (auto t : tokenize(sample, '|', '^')) {
        cout << '"' << t << "\", ";
    }
    cout << ']' << endl;

    return 0;
}
Output:
one^|uno||three^^^^|four^^^|^cuatro|
["one|uno", "", "three^^", "four^|cuatro", "", ]

CLU[edit]

% Iterator that yields the fields of s, split at every occurrence of sep
% that is not escaped by esc. The character following an escape is added
% to the field as-is; the escape itself is not added.
% The field after the last separator is always yielded, even when empty.
tokenize = iter (sep, esc: char, s: string) yields (string)
    escape: bool := false
    part: array[char] := array[char]$[]
    for c: char in string$chars(s) do   
        if escape then 
            escape := false
            array[char]$addh(part,c)
        elseif c=esc then
            escape := true
        elseif c=sep then
            % unescaped separator: the current field is complete
            yield(string$ac2s(part))
            part := array[char]$[]
        else
            array[char]$addh(part,c)
        end
    end
    yield(string$ac2s(part))
end tokenize

% Demo: writes each field of the test string to the primary output,
% enclosed in double quotes.
% NOTE(review): the literal below ends in "quatro", whereas the task's test
% string ends in "cuatro" - confirm which is intended.
start_up = proc ()
    po: stream := stream$primary_output()
    testcase: string := "one^|uno||three^^^^|four^^^|^quatro|"
    
    for part: string in tokenize('|', '^', testcase) do
        stream$putl(po, "\"" || part || "\"")
    end
end start_up
Output:
"one|uno"
""
"three^^"
"four^|quatro"
""

COBOL[edit]

       >>SOURCE FORMAT FREE
identification division.
program-id. 'tokenizewithescaping'.
environment division.
configuration section.
repository.
    function all intrinsic.
data division.
working-storage section.

01 escape-char pic x value '^'.
01 separator-char pic x value '|'.
01 reference-string pic x(64) value
   'one^|uno||three^^^^|four^^^|^cuatro|'.

01 input-string pic x(64).
01 c pic 99.
01 escaped pic x.

01 t pic 99.
01 t-max pic 99.
01 t-lim pic 99 value 32.
01 token-entry occurs 32.
   03  token-len pic 99.
   03  token pic x(16).

01 l pic 99.
01 l-lim pic 99 value 16.

01 error-found pic x.

procedure division.
*> Driver: runs the tokenize paragraph on the reference string and then on
*> a series of further inputs (a single token, only escape characters, only
*> separators, a repeated 'token', a repeated 't|', and all spaces).
start-tokenize-with-escaping.

    move reference-string to input-string
    perform tokenize

    move 'token' to input-string
    perform tokenize
   
    move '^^^^^^^^' to input-string
    perform tokenize
   
    move '||||||||' to input-string
    perform tokenize

    move all 'token' to input-string
    perform tokenize

    move all 't|' to input-string
    perform tokenize

    move spaces to input-string
    perform tokenize

    display space

    stop run
    .
*> Tokenizes input-string into the token-entry table and displays the result.
*> Scanning stops at the end of input-string or as soon as the rest of it is
*> all spaces. 'escaped' is 'Y' directly after an escape character.
*> On an error reported by increment-t-max or move-c the paragraph is left
*> immediately, without displaying any tokens.
tokenize.
    display space
    display 'string:'
    display input-string

    move 'N' to escaped error-found
    move 1 to t-max
    initialize token-entry(t-max)
    move 0 to l

    perform varying c from 1 by 1 until
    c > length(input-string)
    or input-string(c:) = spaces

        evaluate escaped also input-string(c:1)
        when 'N' also escape-char
            move 'Y' to escaped
        when 'N' also separator-char
            perform increment-t-max
            if error-found = 'Y'
                exit paragraph
            end-if
        when 'N' also any
            perform move-c
            if error-found = 'Y'
                exit paragraph
            end-if
        when 'Y' also any
            *> escaped character: copied as-is, whatever it is
            perform move-c
            if error-found = 'Y'
                exit paragraph
            end-if
            move 'N' to escaped
        end-evaluate

    end-perform
    *> record the length of the token that was still being built
    if l > 0
        move l to token-len(t-max)
    end-if

    *> c = 1 means the loop never ran: the input was all spaces
    if c = 1
        display 'no tokens'
    else
        display 'tokens:'
        perform varying t from 1 by 1 until t > t-max
            if token-len(t) > 0
                display t ': ' token-len(t) space token(t)
            else
                display t ': ' token-len(t)
            end-if
        end-perform
    end-if
    .
*> Closes the current token and starts the next one.
*> Sets error-found to 'Y' (and displays a message) when the table already
*> holds t-lim tokens; otherwise stores the current length, advances t-max,
*> clears the new entry and sets error-found to 'N'.
increment-t-max.
    if t-max >= t-lim
        display 'error: at ' c ' number of tokens exceeds ' t-lim
        move 'Y' to error-found
    else
        move l to token-len(t-max)
        add 1 to t-max
        initialize token-entry(t-max)
        move 0 to l
        move 'N' to error-found
    end-if
    .
*> Appends the character at position c of input-string to the current token.
*> Sets error-found to 'Y' (and displays a message) when the token already
*> holds l-lim characters; otherwise sets it to 'N'.
move-c.
    if l >= l-lim
        display 'error: at ' c ' token length exceeds ' l-lim
        move 'Y' to error-found
    else
        add 1 to l
        move input-string(c:1) to token(t-max)(l:1)
        move 'N' to error-found
    end-if
    .
end program 'tokenizewithescaping'.
Output:
$ cobc -xj tokenizewithescaping.cbl 
 
string:
one^|uno||three^^^^|four^^^|^cuatro|                            
tokens:
01: 07 one|uno         
02: 00
03: 07 three^^         
04: 12 four^|cuatro    
05: 00
 
string:
token                                                           
tokens:
01: 05 token           
 
string:
^^^^^^^^                                                        
tokens:
01: 04 ^^^^            
 
string:
||||||||                                                        
tokens:
01: 00
02: 00
03: 00
04: 00
05: 00
06: 00
07: 00
08: 00
09: 00
 
string:
tokentokentokentokentokentokentokentokentokentokentokentokentoke
error: at 17 token length exceeds 16
 
string:
t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|t|
error: at 64 number of tokens exceeds 32
 
string:
                                                                
no tokens

Common Lisp[edit]

(defun split (input separator escape)
  "Split the string INPUT at every SEPARATOR character that is not escaped.
The character that follows an ESCAPE character is copied literally and the
ESCAPE itself is dropped.  Returns a list of adjustable strings, in order,
including empty fields and the field after the last separator."
  (labels ((fresh-field ()
             (make-array 0 :element-type 'character :adjustable t :fill-pointer t)))
    (let ((fields '())
          (field (fresh-field))
          (literal-next-p nil))
      (loop for ch across input
            do (cond (literal-next-p
                      ;; The previous character was an escape.
                      (vector-push-extend ch field)
                      (setf literal-next-p nil))
                     ((char= ch escape)
                      (setf literal-next-p t))
                     ((char= ch separator)
                      (push field fields)
                      (setf field (fresh-field)))
                     (t
                      (vector-push-extend ch field))))
      (push field fields)
      (nreverse fields))))

(defun main ()
  "Print each field of the task's test string on its own line, in single quotes."
  (let ((fields (split "one^|uno||three^^^^|four^^^|^cuatro|" #\| #\^)))
    (loop for field in fields
          do (format t "'~A'~%" field))))
Output:
'one|uno'
''
'three^^'
'four^|cuatro'
''

D[edit]

Translation of: Java
import std.stdio;

/// Demo: prints the sample string followed by its list of tokens.
void main() {
    immutable string sample = "one^|uno||three^^^^|four^^^|^cuatro|";
    auto tokens = tokenizeString(sample, '|', '^');

    writeln(sample);
    writeln(tokens);
}

/**
 * Splits `source` at every `seperator` that is not escaped by `escape`.
 * The character following an escape is copied literally and the escape
 * itself is dropped. Empty fields, including a trailing one, are kept.
 *
 * Throws: an exception (via `enforce`) when `source` ends with an
 * unescaped escape character.
 */
auto tokenizeString(string source, char seperator, char escape) {
    import std.array : appender;
    import std.exception : enforce;

    auto fields = appender!(string[]);
    auto current = appender!(char[]);
    bool literalNext = false;

    foreach (c; source) {
        if (!literalNext && c == escape) {
            literalNext = true;
        } else if (!literalNext && c == seperator) {
            fields.put(current.data.idup);
            current.clear();
        } else {
            // Ordinary character, or any character right after an escape.
            current.put(c);
            literalNext = false;
        }
    }
    enforce(!literalNext, "Invalid terminal escape");

    fields.put(current.data.idup);
    return fields.data;
}
Output:
one^|uno||three^^^^|four^^^|^cuatro|
["one|uno", "", "three^^", "four^|cuatro", ""]

Dyalect[edit]

Translation of: C#
// Yields the fields of this string, split at every separator that is not
// escaped. The character following an escape is copied literally and the
// escape itself is dropped.
func String.Tokenize(separator, escape) {
    var buffer = []
    var escaping = false
    for c in this {
        if escaping {
            buffer.Add(c)
            escaping = false
        } else if c == escape {
            escaping = true
        } else if c == separator {
            yield buffer.Flush();
        } else {
            buffer.Add(c);
        }
    }

    // The text after the last separator is always a field, even when it is
    // empty. Yielding it unconditionally also avoids indexing the last
    // character of an empty string.
    yield buffer.Flush()
}
 
// Concatenates the array's values into a string, clears the array and
// returns the string.
func Array.Flush() {
    var str = String.Concat(values: this)
    this.Clear()
    str
}
 
let testcase = "one^|uno||three^^^^|four^^^|^cuatro|";
for token in testcase.Tokenize(separator: '|', escape: '^') {
    print(": \(token)")
}
Output:
: one|uno
:
: three^^
: four^|cuatro
:

Elena[edit]

Translation of: C#

ELENA 4.x :

import extensions;
import extensions'routines;
import system'collections;
import system'routines;
import system'text;
 
extension op : String
{
    // Splits the string at every separator that is not escaped and returns
    // the fields as a list. The character following an escape is copied
    // literally and the escape itself is dropped.
    tokenize(separator,escape)
    {
        auto buffer := new TextBuilder();
        auto list := new ArrayList();
 
        bool escaping := false;
        self.forEach:(ch)
        {
            if (escaping)
            {
                buffer.write:ch;
                escaping := false
            }
            else if (ch == escape)
            {
                escaping := true
            }
            else if (ch == separator)
            {
                list.append(buffer.Value);
                buffer.clear()
            }
            else
            {
                buffer.write:ch
            }
        };
 
        // The text after the last separator is a field too (possibly empty);
        // without this the final field would be lost.
        list.append(buffer.Value);
 
        ^ list
    }
}
 
const string testcase = "one^|uno||three^^^^|four^^^|^cuatro|";
 
public program()
{
    testcase.tokenize("|", "^").forEach:printingLn
}
Output:
one|uno

three^^
four^|cuatro

F#[edit]

open System
open System.Text.RegularExpressions

(*
    .NET regexes have unlimited look-behind, so we can look for separators
    which are preceded by an even number of (or no) escape characters.
    That run of escapes must itself start at the beginning of the string
    (the ^ anchor) or after a character that is not an escape.
    The returned pieces still contain their escape characters.
*)
let split esc sep s =
    Regex.Split (
        s,
        String.Format("(?<=(?:^|[^{0}])(?:{0}{0})*){1}", Regex.Escape(esc), Regex.Escape(sep))
        )

// Removes every escape character that escapes something, keeping the
// character it escapes. Singleline makes '.' match a newline too, so an
// escaped newline is handled like any other escaped character.
let unescape esc s =
    Regex.Replace(
        s,
        Regex.Escape(esc) + "(.)",
        "$1",
        RegexOptions.Singleline
        )

// Demo: splits the task's test string, unescapes each piece and prints
// every field on its own line in single quotes.
[<EntryPoint>]
let main argv =
    let esc, sep = "^", "|"
    let fields =
        split esc sep "one^|uno||three^^^^|four^^^|^cuatro|"
        |> Seq.map (unescape esc)
    for field in fields do
        printfn "'%s'" field
    0
Output:
'one|uno'
''
'three^^'
'four^|cuatro'
''

Factor[edit]

This example uses Factor's parser-combinators vocabulary, which is modeled after Haskell's parser combinators. Page 51 of this pdf contains a useful introduction to this vocabulary.

Works with: Factor version 0.99 2019-10-06
USING: accessors kernel lists literals namespaces
parser-combinators prettyprint sequences strings ;

! Dynamic variables holding the escape and separator characters in use.
SYMBOLS: esc sep ;

! Stores the top of the stack in esc and the item below it in sep.
: set-chars ( m n -- ) [ sep set ] [ esc set ] bi* ;
! Parser for the escape character itself.
: escape ( -- parser ) esc get 1token ;
! Parser for an escape followed by any character.
! NOTE(review): presumably &> keeps only the second parser's result, which
! is what drops the escape from the output - confirm against the
! parser-combinators documentation.
: escaped ( -- parser ) escape any-char-parser &> ;
! Parser for the separator character.
: separator ( -- parser ) sep get 1token ;

! Parser for one character that is neither the escape nor the separator.
: character ( -- parser )
    ${ esc get sep get } [ member? not ] curry satisfy ;

! Parser for one token: any number of escaped or ordinary characters.
: my-token ( -- parser ) escaped character <|> <*> ;

! Parser for tokens delimited by the separator; each token is converted
! to a string.
: token-list ( -- parser )
    my-token separator list-of [ [ >string ] map ] <@ ;

! Sets the separator and escape characters, parses str with token-list and
! takes the parsed value of the first result.
: tokenize ( str sep-char esc-char -- seq )
    set-chars token-list parse car parsed>> ;

"one^|uno||three^^^^|four^^^|^cuatro|"
CHAR: | CHAR: ^ tokenize .
Output:
{ "one|uno" "" "three^^" "four^|cuatro" "" }

Forth[edit]

\ 'src holds the address of the source text, #src its size in bytes and
\ offset the current position within it.
variable	   'src
variable	   #src
variable     offset

\ The separator | and the escape ^ are hard-coded in separator? and escape?.
\ advance    ( -- )       move to the next position
\ chr@       ( -- c )     character at the current position
\ nextchr    ( -- c )     advance, then fetch the character
\ bound      ( -- flag )  true while offset is below #src
\ separator? ( c -- )     start a new line on |, otherwise print c
\ escape?    ( c -- )     on ^ print the following character as-is,
\                         otherwise hand c to separator?
\ tokenize   ( -- )       print the tokens of the text at 'src, one per line
\ NOTE(review): tokenize advances before it fetches, so it starts at offset 1
\ (past the count byte laid down by ," ) and its last iteration looks like it
\ reads one byte past the text - confirm.
: advance    1 offset +! ;
: chr@       offset @ 'src @ + c@ ;
: nextchr    advance chr@ ;
: bound      offset @ #src @ u< ;
: separator? dup [char] | = if drop cr             else  emit      then ;
: escape?    dup [char] ^ = if drop  nextchr  emit else separator? then ;
: tokenize   0 offset ! begin bound while nextchr escape? repeat ;

\ Test of  function
Here 'src ! ," one^|uno||three^^^^|four^^^|^cuatro|" here  'src @ - #src  !
page
cr ." #### start ####" cr tokenize cr ." #### End  ####" cr
Output:
#### start ####
one|uno

three^^
four^|cuatro

#### End  ####


Fortran[edit]

First Fortran (1958) offered no facilities for inspecting or manipulating text, until Fortran IV when the A format code was introduced whereby text could be read or written from numeric variables. The difficulties and incompatibilities between different computers were eased with F77 that offered CHARACTER*n variables, though they are not quite strings that have a varying length. F95 introduces the ability to define a compound entity such as a string and F2003 standardised a version of strings whereby with each assignment to such a variable, it would be re-allocated with the required amount of storage. Otherwise, one proceeds with CHARACTER variables and an associated variable containing its current length as with TOKEN and L. However, when passed to subroutines (or functions) as a parameter, a CHARACTER variable is supplied along with a secret additional parameter giving the size of the variable, and this is stringlike, so long as there is no need to change the length. Thus, the length of parameter TEXT to subroutine SPLIT can be found via LEN(TEXT).

The source style is F90 simply for the convenience of having subroutine SPLOT defined within subroutine SPLIT so as to gain access to certain variables. If separate subroutines were to be used, then there would have to be parameters or COMMON variables, or, one could just replicate the code within SPLIT. A further F90 feature involves declaring the size of internal variable TOKEN to be LEN(TEXT), which is surely the largest it could be. Otherwise, one would have to select some "surely big enough" value.
      SUBROUTINE SPLIT(TEXT,SEP,ESC)    !Prints the tokens found in TEXT.
C     Tokens are delimited by SEP. The character following an ESC is
C     taken literally and the ESC itself is not copied. Each token is
C     written to unit 6 as soon as it is complete.
       CHARACTER*(*) TEXT       !The text to scan.
       CHARACTER*(1) SEP        !The token separator.
       CHARACTER*(1) ESC        !The escape character.
       CHARACTER*(LEN(TEXT)) TOKEN      !A token cannot be longer.
       CHARACTER*(1) C          !The character under inspection.
       INTEGER N                !Number of tokens shown so far.
       INTEGER I                !Position within TEXT.
       INTEGER L                !Length of the token being built.
       LOGICAL ESCAPING         !True directly after an ESC.
        N = 0
        L = 0
        ESCAPING = .FALSE.
        DO I = 1,LEN(TEXT)
          C = TEXT(I:I)
          IF (.NOT.ESCAPING .AND. C.EQ.ESC) THEN
            ESCAPING = .TRUE.           !The next character is literal.
          ELSE IF (.NOT.ESCAPING .AND. C.EQ.SEP) THEN
            CALL SPLOT                  !The token before SEP is done.
          ELSE
            L = L + 1                   !Ordinary or escaped character.
            TOKEN(L:L) = C
            ESCAPING = .FALSE.
          END IF
        END DO
C     Whatever follows the last separator is a token too, even if empty.
        CALL SPLOT
       CONTAINS
        SUBROUTINE SPLOT        !Shows the current token and resets it.
         N = N + 1
         WRITE (6,1) N,TOKEN(1:L)
    1    FORMAT ("Token ",I0," >",A,"<")
         L = 0
        END SUBROUTINE SPLOT
      END SUBROUTINE SPLIT

      PROGRAM POKE

      CALL SPLIT("one^|uno||three^^^^|four^^^|^cuatro|","|","^")

      END

The output has the text of the tokens marked >thus<

Token 1 >one|uno<
Token 2 ><
Token 3 >three^^<
Token 4 >four^|cuatro<
Token 5 ><

The terminating separator character is deemed to mark the start of a null token in the problem's specification. If the text ends without one, then the end-of-text ends the token and the DO-loop quits with L > 0 and so something for SPLOT. If the penultimate character were an ESC followed by a SEP, then the loop also ends with L > 0. If the text ends with a SEP but not preceded by an ESC (as in the example) then L = 0 - but SPLOT is invoked unconditionally. A SEP at the start of the text will also elicit a null token, as will an entirely null text.

If the text ends with an ESC, then this is surely a mistake and could be caught via a test that ESCAPING was true on exit from the loop. But no error messages are called for...

In this example the DO-loop relentlessly steps through the text, and in general this would not be convenient. Normally, token identification proceeds within a much larger context where one would not discard the token immediately after it is isolated, and rather than copying the text hither and thither, one might prefer to identify it in-place, say with variables L1 and L2 identifying the start and end positions within the working area. In such a case there would no longer be a need for a variable TOKEN and the angst of deciding on a suitable maximum size. This would also make it easier in any error messages to show context and provenance. However, the bizarre miscegnation of "escape" sequences (especially confusing within text literals), means that the source text does not necessarily constitute the text of the token.


FreeBASIC[edit]

Translation of: Ring
' Prints the fields of cadena, one per line, each preceded by its field
' number. Fields are split at every separador that is not escaped by escape;
' the character following an escape is printed as-is.
Sub tokenize(cadena As String, separador As String, escape As String)
    Dim As Integer fieldNo = 1
    Dim As Boolean literalNext = false
    Dim As String ch
    Print ""; fieldNo; " ";

    For posn As Integer = 1 To Len(cadena)
        ch = Mid(cadena, posn, 1)
        If literalNext Then
            Print ch;
            literalNext = false
        ElseIf ch = separador Then
            ' Unescaped separator: end the line and start the next field.
            Print
            fieldNo += 1
            Print ""; fieldNo; " ";
        ElseIf ch = escape Then
            literalNext = true
        Else
            Print ch;
        End If
    Next posn
    Print
End Sub

tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
Sleep
Output:
Same as the Ring entry.


Go[edit]

package main

import (
	"errors"
	"fmt"
)

// TokenizeString splits s at every occurrence of sep that is not escaped.
// A rune immediately following an unescaped escape rune is copied verbatim
// and the escape rune itself is dropped. Empty fields are preserved, so the
// result always holds at least one token.
//
// If s ends with an unescaped escape rune, the tokens collected so far are
// still returned together with a non-nil error.
func TokenizeString(s string, sep, escape rune) (tokens []string, err error) {
	var field []rune
	escaped := false
	for _, r := range s {
		if escaped {
			// Whatever follows an escape is literal.
			field = append(field, r)
			escaped = false
			continue
		}
		if r == escape {
			escaped = true
			continue
		}
		if r == sep {
			// string(field) copies, so the backing array can be reused.
			tokens = append(tokens, string(field))
			field = field[:0]
			continue
		}
		field = append(field, r)
	}
	tokens = append(tokens, string(field))
	if escaped {
		err = errors.New("invalid terminal escape")
	}
	return tokens, err
}

// main runs TokenizeString on the task's sample input and prints either the
// quoted token list or the error it returned.
func main() {
	const (
		sample    = "one^|uno||three^^^^|four^^^|^cuatro|"
		separator = '|'
		escape    = '^'
	)

	fmt.Printf("Input:   %q\n", sample)
	tokens, err := TokenizeString(sample, separator, escape)
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Printf("Tokens: %q\n", tokens)
}
Output:
Input:   "one^|uno||three^^^^|four^^^|^cuatro|"
Tokens: ["one|uno" "" "three^^" "four^|cuatro" ""]

Haskell[edit]

Deterministic Finite Automaton[edit]

-- | Split any foldable sequence at each unescaped separator.
-- The fold state is (was the previous item an unescaped escape?, tokens),
-- where tokens are kept newest-first and each token is built in reverse.
splitEsc :: (Foldable t1, Eq t) => t -> t -> t1 t -> [[t]]
splitEsc sep esc = reverse . map reverse . snd . foldl step (False, [[]])
  where
    step (escaped, cur:done) ch
      | not escaped && ch == esc           = (True,  cur:done)
      | not escaped && ch == sep           = (False, []:cur:done)
      -- Pathological case sep == esc: a "separator" followed by an
      -- ordinary item starts a new token with that item.
      | escaped && sep == esc && ch /= sep = (False, [ch]:cur:done)
      | otherwise                          = (False, (ch:cur):done)
Output:
λ> splitEsc '|' '^' "one^|uno||three^^^^|four^^^|^cuatro|"
["one|uno","","three^^","four^|cuatro",""]

The solution works with any foldable structures.

λ> splitEsc 11 0 [2,3,11,3,4,5,11,0,11,2,3,4]
[[2,3],[3,4,5],[11,2,3,4]]

It handles pathological case when separator and escape are the same:

λ> splitEsc '|' '|' "one^|uno||three^^^^|four^^^|^cuatro|"
["one^","uno|three^^^^","four^^^","^cuatro"]

For splitting lists without escaping see Data.List.Split package.

Conduit-based solution[edit]

Constant in space (~ O(k), where k is the token length), as fast as the DFA-based solution.

{-#Language LambdaCase #-}
import Conduit

-- | Streaming tokenizer: consumes items one at a time and yields one token
-- (a list of items) per field. Tokens are accumulated in reverse in @b@ and
-- reversed on output by 'mapOutput reverse'.
splitEscC :: (Monad m, Eq t) => t -> t -> Conduit t m [t]
splitEscC sep esc = mapOutput reverse $ go True [] 
  where
    -- notEsc is False exactly when the previous item was an unescaped escape.
    go notEsc b = await >>= \case 
      -- End of input: emit whatever has been collected (possibly empty).
      Nothing -> yield b
      Just ch | notEsc && ch == esc -> go False b
              | notEsc && ch == sep -> yield b >> go True []
              -- Ordinary item, or any item following an escape.
              | otherwise -> go True (ch:b)

This new conduit could be used in a pipeline as follows:

-- Feed the sample string through splitEscC and print each token on its own line.
main = runConduit $
  yieldMany "one^|uno||three^^^^|four^^^|^cuatro|"
  .| splitEscC '|' '^'
  .| mapM_C print
λ> main
"one|uno"
""
"three^^"
"four^|cuatro"
""

Alternative[edit]

This is essentially equivalent to the first (DFA) example, but, though possibly less elegant than the guard idiom, appears to be fractionally faster with larger (eg 180k) test strings.

import Data.Bool (bool)

------------------ TOKENIZE WITH ESCAPING ----------------

-- | Split a string at each unescaped delimiter, dropping the escape
-- characters that were used to escape something.
tokenize :: Char -> Char -> String -> [String]
tokenize delim esc str = reverse (map reverse (lastTok : doneToks))
  where
    -- Left-to-right scan; state is (current token reversed,
    -- finished tokens newest-first, previous char was an unescaped escape).
    (lastTok, doneToks, _) = foldl step ([], [], False) str

    step (cur, done, escaped) c
      -- The third component stays (c == esc) so that the degenerate case
      -- delim == esc behaves exactly as a plain two-flag formulation would.
      | not escaped && c == delim = ([], cur : done, c == esc)
      | not escaped && c == esc   = (cur, done, True)
      | otherwise                 = (c : cur, done, False)

--------------------------- TEST -------------------------
-- Print each token of the task's sample string on its own line.
main :: IO ()
main =
  mapM_ print $
    tokenize
      '|'
      '^'
      "one^|uno||three^^^^|four^^^|^cuatro|"
Output:
"one|uno"
""
"three^^"
"four^|cuatro"
""

J[edit]

From the python example:

NB. tokenize1: character-by-character state machine.
NB. Dyadic use: x is the two characters (escape, separator), y is the text.
NB. Monadic use re-invokes the verb with the default left argument '^|'.
NB. The result is a list of boxed tokens.
tokenize1=: tokenize =: '^|'&$: :(4 : 0)
 'ESC SEP' =. x
 NB. STATE is 1 only while the previous character was an unescaped ESC.
 STATE =. 0
 RESULT =. 0 $ a:
 TOKEN =. ''
 for_C. y do.
  if. STATE do.
   NB. Character after an escape is always taken literally.
   TOKEN =. TOKEN , C
   STATE =. 0
  else.
   if. C = ESC do.
    STATE =. 1
   elseif. C = SEP do.
    NB. Unescaped separator: box the finished token and start a new one.
    RESULT =. RESULT , < TOKEN
    TOKEN =. ''
   elseif. do.
    TOKEN =. TOKEN , C
   end.
  end.
 end.
 NB. The final token (possibly empty) is always appended.
 RESULT =. RESULT , < TOKEN
)
tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘
   

Here's a somewhat more efficient approach (over 100 times faster on a 100k textual example):

NB. tokenize2: whole-array formulation of the same tokenizer.
NB. Monadic use supplies the default '^|' (escape, separator) and recurses;
NB. the dyadic part below works on boolean masks the same length as y.
tokenize2=: tokenize=:3 :0
  '^|' tokenize2 y  NB. task default escape and separator
:
  'ESC SEP'=. x
  E=. 18 b./\.&.|.ESC=y NB. escape positions
  S=. (SEP=y)>_1}.0,E NB. separator positions
  K=. -.E+.S NB. keep positions
  T=. (#y){. 1,}.S NB. token beginnings
  NB. Cut both the keep mask and the text at the token beginnings,
  NB. then use each mask piece to select from its text piece.
  (T<;.1 K)#&.>T<;.1 y
)

Example use:

   '^|' tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘


Solution invoking the sequential machine primitive verb.[this thread.]
NB. tokenize3: pipeline charTokens -> splitTokens -> removeExtra, then raze each box.
NB. The escape '^' and separator '|' are hard-coded in these definitions.
charTokens =: (0;(3 2 2$(2 1 1 1 2 2 1 2 1 0 1 0));<<'^')&;:  NB. sequential machine
NB. Prefix a boxed '|' and cut at every boxed '|' (the cut drops the frets).
splitTokens =: ((<,'|')&= <;._1 ])@:((<,'|'),])
NB. At leaf level, drop the first item of any leaf longer than one item
NB. (presumably the escape character kept by charTokens -- confirm).
removeExtra =: (}.^:(1<#)) L:0
tokenize3=: tokenize=: ; each @: (removeExtra @: splitTokens @: charTokens)
Example use:
   t=: 'one^|uno||three^^^^|four^^^|^cuatro|'

  tokenize t
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘

   $tokenize t
5

Relative efficiencies:

   txt=: 1e5$'one^|uno||three^^^^|four^^^|^cuatro|'
   
   (%"1 <./) timespacex every 'tokenize1 txt';'tokenize2 txt';'tokenize3 txt'
132.856       1
      1 7.73534
8.29568 19.9766

So tokenize2 is the fastest, while tokenize1 uses the least amount of memory. Also, tokenize1 is the slowest and tokenize3 uses the most memory. (First column is relative time used, second is relative space used, rows correspond to implementations.)

Java[edit]

Translation of: Go
Works with: Java version 7
import java.util.*;

public class TokenizeStringWithEscaping {

    public static void main(String[] args) {
        String sample = "one^|uno||three^^^^|four^^^|^cuatro|";
        char separator = '|';
        char escape = '^';

        System.out.println(sample);
        try {
            System.out.println(tokenizeString(sample, separator, escape));
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    public static List<String> tokenizeString(String s, char sep, char escape)
            throws Exception {
        List<String> tokens = new ArrayList<>();
        StringBuilder sb = new StringBuilder();

        boolean inEscape = false;
        for (char c : s.toCharArray()) {
            if (inEscape) {
                inEscape = false;
            } else if (c == escape) {
                inEscape = true;
                continue;
            } else if (c == sep) {
                tokens.add(sb.toString());
                sb.setLength(0);
                continue;
            }
            sb.append(c);
        }
        if (inEscape)
            throw new Exception("Invalid terminal escape");

        tokens.add(sb.toString());

        return tokens;
    }
}
[one|uno, , three^^, four^|cuatro, ]

JavaScript[edit]

ES5[edit]

Iterative[edit]

// Splits s at every sep that is not preceded by an unescaped esc.
// The character following esc is copied verbatim; esc itself is dropped.
// Returns an array of fields, empty fields included.
function tokenize(s, esc, sep) {
	var fields = [],
		current = '',
		pos = 0,
		ch;

	while (pos < s.length) {
		ch = s.charAt(pos);
		if (ch == esc) {
			// Skip the escape and take the next character as-is
			// (charAt yields '' when the escape is the last character).
			pos += 1;
			current += s.charAt(pos);
		} else if (ch == sep) {
			fields.push(current);
			current = '';
		} else {
			current += ch;
		}
		pos += 1;
	}
	fields.push(current);
	return fields;
}

var s = 'one^|uno||three^^^^|four^^^|^cuatro|'
document.write(s, '<br>')	
for (var a=tokenize(s,'^','|'), i=0; i<a.length; i+=1) document.write(i, ': ', a[i], '<br>')
Output:
one^|uno||three^^^^|four^^^|^cuatro|
0: one|uno
1: 
2: three^^
3: four^|cuatro
4: 

Functional[edit]

(function () {
    'use strict';

    // tokenize :: String -> Character -> Character -> [String]
    // Fields of str split at each unescaped charDelim; a character that
    // follows an unescaped charEsc is literal and the charEsc is dropped.
    function tokenize(str, charDelim, charEsc) {
        var tokens = [],
            current = '',
            escaped = false,
            i, ch, isBreak, isEsc;

        for (i = 0; i < str.length; i += 1) {
            ch = str.charAt(i);
            isBreak = !escaped && ch === charDelim;
            isEsc = !escaped && ch === charEsc;

            if (isBreak) {
                tokens.push(current);
                current = '';
            } else if (!isEsc) {
                current += ch;
            }
            escaped = isEsc;
        }

        tokens.push(current);
        return tokens;
    }

    return tokenize(
            'one^|uno||three^^^^|four^^^|^cuatro|',
            '|','^'
        )
        .join('\n');

})();
Output:
one|uno

three^^
four^|cuatro

ES6[edit]

Hand-parsed[edit]

Translation of: Haskell
(Single fold version)
((() => {

    // tokenize :: Character -> Character -> String -> [String]
    // Fields of str split at each unescaped charDelim; the character after
    // an unescaped charEsc is kept literally and the charEsc is discarded.
    const tokenize = (charDelim, charEsc, str) => {
        const tokens = [];
        let current = '';
        let escaped = false;

        for (const ch of str.split('')) {
            const isBreak = !escaped && ch === charDelim;
            const isEsc = !escaped && ch === charEsc;

            if (isBreak) {
                tokens.push(current);
                current = '';
            } else if (!isEsc) {
                current += ch;
            }
            escaped = isEsc;
        }

        tokens.push(current);
        return tokens;
    };

    // splitEsc :: String -> [String]
    const splitEsc = str => tokenize('|', '^', str);


    // TEST
    // show :: a -> String
    const show = x => JSON.stringify(x, null, 2);

    return splitEsc(
            'one^|uno||three^^^^|four^^^|^cuatro|',
        )
        .map(show)
        .join('\n');
}))();
Output:
"one|uno"
""
"three^^"
"four^|cuatro"
""

Parser combinators[edit]

Defining the function as a composition of generics from a parser combinator library:

(() => {
    'use strict';

    // ------ TOKENIZATION OF A STRING WITH ESCAPES ------

    // tokenizedWithEscapes :: Char -> Char -> 
    // String -> [String]
    const tokenizedWithEscapes = esc =>
        // A list of tokens in a given string, 
        // where the separator character is sep
        // and any character may be escaped by 
        // a preceding esc character.
        //
        // Built as: parse the string with sepBy(token)(char(sep)),
        // where a token is takeWhileEscP accepting any escaped
        // character and any unescaped character other than sep;
        // then keep only the parsed value (fst) of each result.
        sep => compose(
            concatMap(fst),
            parse(
                sepBy(
                    takeWhileEscP(esc)(
                        constant(true)
                    )(
                        ne(sep)
                    )
                )(char(sep))
            )
        );

    // ---------------------- TEST -----------------------
    // main :: IO ()
    const main = () =>
        JSON.stringify(
            tokenizedWithEscapes('^')('|')(
                'one^|uno||three^^^^|four^^^|^cuatro|'
            ),
            null, 2
        );
    // -->
    // [
    //     "one|uno",
    //     "",
    //     "three^^",
    //     "four^|cuatro",
    //     ""
    // ]

    // ----------- GENERIC PARSER COMBINATORS ------------

    // Parser :: (String -> [(a, String)]) -> Parser a
    const Parser = f =>
        // A function lifted into a Parser object.
        // f maps an input string to a list of
        // (value, remaining input) pairs; an empty
        // list is how the parsers below signal failure.
        ({
            type: 'Parser',
            parser: f
        });


    // altP (<|>) :: Parser a -> Parser a -> Parser a
    const altP = p =>
        // p, or q if p doesn't match.
        q => Parser(s => {
            const xs = parse(p)(s);
            return 0 < xs.length ? (
                xs
            ) : parse(q)(s);
        });


    // anyChar :: () -> Parser Char
    const anyChar = () =>
        // A single character.
        Parser(
            s => 0 < s.length ? [
                Tuple(s[0])(
                    s.slice(1)
                )
            ] : []
        );


    // apP <*> :: Parser (a -> b) -> Parser a -> Parser b
    const apP = pf =>
        // A new parser obtained by the application 
        // of a Parser-wrapped function,
        // to a Parser-wrapped value.
        p => Parser(
            s => parse(pf)(s).flatMap(
                vr => parse(
                    fmapP(vr[0])(p)
                )(vr[1])
            )
        );


    // bindP (>>=) :: Parser a -> 
    // (a -> Parser b) -> Parser b
    const bindP = p =>
        // A new parser obtained by the application of 
        // a function to a Parser-wrapped value.
        // The function must enrich its output, lifting it 
        // into a new Parser.
        // Allows for the nesting of parsers.
        f => Parser(
            s => parse(p)(s).flatMap(
                tpl => parse(f(tpl[0]))(tpl[1])
            )
        );


    // char :: Char -> Parser Char
    const char = x =>
        // A particular single character.
        satisfy(c => x == c);


    // fmapP :: (a -> b) -> Parser a -> Parser b  
    const fmapP = f =>
        // A new parser derived by the structure-preserving 
        // application of f to the value in p.
        p => Parser(
            s => parse(p)(s).flatMap(
                first(f)
            )
        );


    // liftA2P :: (a -> b -> c) -> 
    // Parser a -> Parser b -> Parser c
    const liftA2P = op =>
        // The binary function op, lifted
        // to a function over two parsers.
        p => apP(fmapP(op)(p));


    // many :: Parser a -> Parser [a]
    const many = p => {
        // Zero or more instances of p.
        // Lifts a parser for a simple type of value 
        // to a parser for a list of such values.
        //
        // NOTE(review): the zero-match fallback is pureP('') rather
        // than pureP([]), so the innermost [x].concat(xs) appends a
        // trailing '' element. This looks like the source of the extra
        // "" in the printed token list. Confirm before changing:
        // takeWhileEscP appears to rely on the string fallback when it
        // concatenates its pieces.
        const some_p = p =>
            liftA2P(
                x => xs => [x].concat(xs)
            )(p)(many(p));
        return Parser(
            s => parse(
                // On exhausted input, succeed immediately with no matches.
                0 < s.length ? (
                    altP(some_p(p))(pureP(''))
                ) : pureP('')
            )(s)
        );
    };


    // parse :: Parser a -> String -> [(a, String)]
    const parse = p =>
        // The result of parsing a string with p.
        p.parser;


    // pureP :: a -> Parser a
    const pureP = x =>
        // The value x lifted, unchanged, 
        // into the Parser monad.
        Parser(s => [Tuple(x)(s)]);


    // satisfy :: (Char -> Bool) -> Parser Char
    const satisfy = test =>
        // Any character for which the 
        // given predicate returns true.
        Parser(
            s => 0 < s.length ? (
                test(s[0]) ? [
                    Tuple(s[0])(s.slice(1))
                ] : []
            ) : []
        );


    // sepBy :: Parser a -> Parser b -> Parser [a]
    const sepBy = p =>
        // Zero or more occurrences of p, as 
        // separated by (discarded) instances of sep.
        sep => altP(
            sepBy1(p)(sep)
        )(
            pureP([])
        );


    // sepBy1 :: Parser a -> Parser b -> Parser [a]
    const sepBy1 = p =>
        // One or more occurrences of p, as 
        // separated by (discarded) instances of sep.
        sep => bindP(
            p
        )(x => bindP(
            many(
                thenP(sep)(
                    bindP(p)(pureP)
                )
            )
        )(xs => pureP([x].concat(xs))));


    // takeWhileEscP :: Char -> (Char -> Bool) -> 
    // (Char -> Bool) -> Parser Text
    const takeWhileEscP = esc =>
        escTest => test => {
            // Longest prefix, including any escaped
            // characters, in which escTest returns
            // true for all escaped characters, and
            // test returns true for all other chars.
            //
            // NOTE(review): escTest is not referenced in this body;
            // any character following esc is accepted via anyChar().

            // plain: a run of unescaped characters that pass test.
            const plain = takeWhileP(
                c => (esc !== c) && test(c)
            );
            // escaped: esc (discarded), then any one character,
            // then the plain run that follows it.
            const escaped = thenBindP(
                char(esc)
            )(
                anyChar()
            )(x => bindP(
                plain
            )(
                compose(pureP, cons(x))
            ));
            // A token is one plain run followed by zero or more
            // escaped pieces, all concatenated together.
            return bindP(
                plain
            )(x => bindP(
                many(escaped)
            )(xs => pureP(concat([x].concat(xs)))));
        };


    // takeWhileP :: (Char -> Bool) -> Parser String
    const takeWhileP = p =>
        // The largest prefix in which p is
        // true over all the characters.
        Parser(
            compose(
                pureList,
                first(concat),
                span(p)
            )
        );


    // thenBindP :: Parser a -> Parser b -> 
    // (b -> Parser c) -> Parser c
    const thenBindP = o =>
        // A combination of thenP and bindP in which a 
        // preliminary  parser consumes text and discards
        // its output, before any output of a subsequent
        // parser is bound.
        p => f => Parser(
            // Run o, ignore its value (vr[0]) and continue from the
            // remaining input (vr[1]); then run p and pass its value
            // to f, parsing on with whatever parser f returns.
            s => parse(o)(s).flatMap(
                vr => parse(p)(vr[1]).flatMap(
                    tpl => parse(f(tpl[0]))(tpl[1])
                )
            )
        );


    // thenP (>>) :: Parser a -> Parser b -> Parser b
    const thenP = o =>
        // A composite parser in which o just consumes text
        // and then p consumes more and returns a value.
        p => Parser(
            s => parse(o)(s).flatMap(
                vr => parse(p)(vr[1])
            )
        );


    // --------------------- GENERIC ---------------------

    // Tuple (,) :: a -> b -> (a, b)
    const Tuple = a =>
        b => ({
            type: 'Tuple',
            '0': a,
            '1': b,
            length: 2
        });


    // compose (<<<) :: (b -> c) -> (a -> b) -> a -> c
    const compose = (...fs) =>
        // The right-to-left composition of all the functions
        // in fs: the last function is applied first.
        // With no functions, the identity.
        x => fs.reduceRight(
            (acc, f) => f(acc),
            x
        );


    // concat :: [[a]] -> [a]
    // concat :: [String] -> String
    const concat = xs => (
        ys => 0 < ys.length ? (
            ys.every(Array.isArray) ? (
                []
            ) : ''
        ).concat(...ys) : ys
    )(list(xs));


    // concatMap :: (a -> [b]) -> [a] -> [b]
    const concatMap = f =>
        // List monad bind operator.
        xs => xs.flatMap(f);


    // cons :: a -> [a] -> [a]
    const cons = x =>
        // A list constructed from the item x,
        // followed by the existing list xs.
        xs => Array.isArray(xs) ? (
            [x].concat(xs)
        ) : 'GeneratorFunction' !== xs
        .constructor.constructor.name ? (
            x + xs
        ) : ( // cons(x)(Generator)
            function* () {
                yield x;
                let nxt = xs.next();
                while (!nxt.done) {
                    yield nxt.value;
                    nxt = xs.next();
                }
            }
        )();


    // constant :: a -> b -> a
    const constant = k =>
        _ => k;


    // first :: (a -> b) -> ((a, c) -> (b, c))
    const first = f =>
        // A simple function lifted to one which applies
        // to a tuple, transforming only its first item.
        xy => Tuple(f(xy[0]))(
            xy[1]
        );


    // fst :: (a, b) -> a
    const fst = tpl =>
        // First member of a pair.
        tpl[0];


    // list :: StringOrArrayLike b => b -> [a]
    const list = xs =>
        // xs itself, if it is an Array,
        // or an Array derived from xs.
        Array.isArray(xs) ? (
            xs
        ) : Array.from(xs || []);


    // map :: (a -> b) -> [a] -> [b]
    const map = f =>
        // The list obtained by applying f
        // to each element of xs.
        // (The image of xs under f).
        xs => [...xs].map(f);


    // ne :: a -> a -> Bool
    const ne = a =>
        b => a !== b;


    // pureList :: a -> [a]
    const pureList = x => [x];


    // span p xs is equivalent to (takeWhile p xs, dropWhile p xs) 
    // span :: (a -> Bool) -> [a] -> ([a], [a])
    const span = p =>
        // Longest prefix of xs consisting of elements which
        // all satisfy p, tupled with the remainder of xs.
        xs => {
            const
                ys = 'string' !== typeof xs ? (
                    list(xs)
                ) : xs,
                iLast = ys.length - 1;
            return splitAt(
                until(
                    i => iLast < i || !p(ys[i])
                )(i => 1 + i)(0)
            )(ys);
        };


    // splitAt :: Int -> [a] -> ([a], [a])
    const splitAt = n =>
        xs => Tuple(xs.slice(0, n))(
            xs.slice(n)
        );


    // unlines :: [String] -> String
    const unlines = xs =>
        // A single string formed by the intercalation
        // of a list of strings with the newline character.
        xs.join('\n');


    // until :: (a -> Bool) -> (a -> a) -> a -> a
    const until = p =>
        // The first value in the series x, f(x), f(f(x)) ...
        // for which p returns true.
        f => x => {
            for (let v = x;; v = f(v)) {
                if (p(v)) return v;
            }
        };

    // MAIN ---
    return main();
})();
Output:
[
  "one|uno",
  "",
  "three^^",
  "four^|cuatro",
  "",
  ""
]

jq[edit]

Works with: jq version 1.5
# Tokenize the input using the string "escape" as the prefix escape string
# and the string "separator" as the field separator.
# Input: a string. Output: an array of strings (the fields).
#
# Strategy: the string is split in stages. Pieces that are already resolved
# are wrapped in arrays (literal text) or replaced by null (a field
# boundary) so that later stages leave them alone; "reform" then glues the
# literal pieces back together between the null boundaries.
def tokenize(separator; escape):

  # Helper functions:
  # mapper/1 is like map/1, but for each element, $e, in the input array,
  # if $e is an array, then it is inserted,
  # otherwise the elements of ($e|f) are inserted.
  def mapper(f): reduce .[] as $e
    ( [];
      if ($e|type) == "array" then . + [$e] else . + ($e | f) end ) ;
  
  # interpolate x
  # (inserts x between consecutive elements of the input array)
  def interpolate(x):
    reduce .[] as $i ([]; . +  [$i, x]) | .[0:-1];
  
  # If the input is a string, split it on s and put twixt between the parts;
  # anything else passes through unchanged.
  def splitstring(s; twixt):
    if type == "string" then split(s) | interpolate(twixt)
    else .
    end;

  # concatenate sequences of non-null elements:
  # each null marks a field boundary; a trailing null becomes a final "".
  def reform:
    reduce .[] as $x ([];
      if $x == null and .[-1] == null then .[0:-1] + ["", null]
      elif $x == null then . + [null]
      elif .[-1] == null then .[0:-1] + [$x]
      else .[0:-1] +  [ .[-1] + $x ]
      end)
    | if .[-1] == null then .[-1] = "" else . end;
  
  # 1. escape+escape    -> literal escape      (protected as [escape])
  # 2. escape+separator -> literal separator   (protected as [separator])
  # 3. bare separator   -> field boundary      (null)
  # 4. any remaining escape is dropped by splitting on it
  splitstring(escape + escape; [escape])
  | mapper( splitstring( escape + separator; [separator]) )
  | mapper( splitstring( separator; null ) )
  | map( if type == "string" then split(escape) else . end)
  | flatten
  | reform ;

Example:

"one^|uno||three^^^^|four^^^|^cuatro|" | tokenize("|"; "^")
Output:
$ jq -n -f tokenize.jq
[
  "one|uno",
  "",
  "three^^",
  "four^|cuatro",
  ""
]

Julia[edit]

Works with: Julia version 0.6
Translation of: Kotlin
# Split `s` at each unescaped `sep`, using placeholder substitution:
#   1. every "esc esc" pair becomes the placeholder SPE,
#   2. every "esc sep" pair becomes the placeholder SPF,
#   3. remaining escape characters are deleted (a final one is kept),
#   4. the text is split on `sep`, and the placeholders are turned back
#      into a literal `esc` and a literal `sep` in each token.
# NOTE(review): assumes the input never contains U+FFFE or U+FFFF, since
# those are used as placeholders -- confirm for your data.
# NOTE(review): `last(s)` is evaluated unconditionally; check the behaviour
# for an empty input string.
function tokenize2(s::AbstractString, sep::Char, esc::Char)
    SPE = "\ufffe"
    SPF = "\uffff"
    s = replace(s, "$esc$esc", SPE) |>
        s -> replace(s, "$esc$sep", SPF) |>
        # A trailing escape escapes nothing: strip the others, keep that one.
        s -> last(s) == esc ? string(replace(s[1:end-1], esc, ""), esc) : replace(s, esc, "")
    return map(split(s, sep)) do token
        token = replace(token, SPE, esc)
        return replace(token, SPF, sep)
    end
end

@show tokenize2("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^')
Output:
tokenize2("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^') = String["one|uno", "", "three^^", "four^|cuatro", ""]

Kotlin[edit]

// version 1.1.3

const val SPE = "\ufffe"  // unused unicode char in Specials block
const val SPF = "\uffff"  // ditto

/**
 * Splits [str] at every [sep] that is not escaped by a preceding [esc].
 *
 * Works by placeholder substitution: escaped escapes become [SPE] and
 * escaped separators become [SPF]; remaining escape characters are removed,
 * the text is split on [sep], and the placeholders are restored as literal
 * characters. A lone escape at the very end of the input escapes nothing and
 * is kept in the last field.
 *
 * The input must not itself contain U+FFFE or U+FFFF.
 *
 * @return the fields in order, empty fields included; `[""]` for empty input.
 */
fun tokenize(str: String, sep: Char, esc: Char): List<String> {
    var s = str.replace("$esc$esc", SPE).replace("$esc$sep", SPF)
    // endsWith is safe on an empty string, whereas last() throws
    // NoSuchElementException when there is no last character.
    s = if (s.endsWith(esc)) // i.e. 'esc' not escaping anything
        s.dropLast(1).replace("$esc", "") + esc
    else
        s.replace("$esc", "")
    return s.split(sep).map { it.replace(SPE, "$esc").replace(SPF, "$sep") }
}

/** Tokenizes the task's sample string and prints one field per line, showing empty fields as "(empty)". */
fun main(args: Array<String>) {
    val input = "one^|uno||three^^^^|four^^^|^cuatro|"
    val fields = tokenize(input, '|', '^')
    fields.forEach { field ->
        val shown = if (field.isEmpty()) "(empty)" else field
        println(shown)
    }
}
Output:
one|uno
(empty)
three^^
four^|cuatro
(empty)

Lingo[edit]

-- in some movie script

-- Splits str at each unescaped sep and returns the list of fields.
-- First splits naively on sep (via the item delimiter), re-joining a piece
-- with its predecessor whenever the predecessor ends in an odd number of
-- escape characters; then strips the escape characters from every field.
-- Side effect: changes _player.itemDelimiter to sep.
on tokenize (str, sep, esc)
  l = []
  _player.itemDelimiter = sep
  cnt = str.item.count
  repeat with i = 1 to cnt
    prev = l.getLast() -- can be VOID
    -- an odd count of trailing escapes means the separator was escaped
    if _trailEscCount(prev, esc) mod 2 then
      -- drop that last escape and re-insert the separator as literal text
      l[l.count] = prev.char[1..prev.length-1]&sep&str.item[i]
    else
      l.add(str.item[i])
    end if
  end repeat
  -- remove escape characters from tokens
  cnt = l.count
  repeat with i = 1 to cnt
    l[i] = _removeEsc(l[i], esc)
  end repeat
  return l
end

-- counts number of trailing escape characters
-- (scans str backwards from its last character and stops at the first
-- character that is not esc)
on _trailEscCount (str, esc)
  n = 0
  repeat with i = str.length down to 1
    if str.char[i]=esc then n=n+1
    else exit repeat
  end repeat
  return n
end

-- Returns str with escape characters removed.
-- could be implemented more efficiently by using offset()
-- The last character is never examined (the loop stops one short of it).
on _removeEsc (str, esc)
  cnt = str.length-1
  repeat with i = 1 to cnt
    if str.char[i]=esc then
      -- after the delete, position i holds the escaped character, which
      -- the next iteration steps over, so it is kept as-is
      delete char i of str
      cnt = cnt-1
    end if
  end repeat
  return str
end
str = "one^|uno||three^^^^|four^^^|^cuatro|"
sep = "|"
esc = "^"
put tokenize(str, sep, esc)
-- ["one|uno", "", "three^^", "four^|cuatro", ""]

Lua[edit]

-- Split str at every sep that is not escaped by a preceding esc.
-- The character after an unescaped esc is taken literally and the esc is
-- dropped. Returns an array of fields; empty fields are preserved.
function tokenise (str, sep, esc)
    local fields, current, pending = {}, "", false
    for i = 1, #str do
        local c = str:sub(i, i)
        if pending then
            -- character following an escape: always literal
            current = current .. c
            pending = false
        elseif c == esc then
            pending = true
        elseif c == sep then
            fields[#fields + 1] = current
            current = ""
        else
            current = current .. c
        end
    end
    fields[#fields + 1] = current
    return fields
end

-- Demonstration: print each field of the task's sample with its index.
local testStr = "one^|uno||three^^^^|four^^^|^cuatro|"
local testSep, testEsc = "|", "^"
-- ipairs visits indices 1..n in order; pairs makes no ordering guarantee,
-- so the numbered output could otherwise come out shuffled.
for k, v in ipairs(tokenise(testStr, testSep, testEsc)) do
    print(k, v)
end
Output:
1       one|uno
2
3       three^^
4       four^|cuatro
5

Mathematica / Wolfram Language[edit]

(* Tokenize[str, escape, sep] splits str at every sep that is not preceded
   by an unescaped escape character and returns the list of fields.
   escape defaults to "^" and sep to "|". *)
ClearAll[Tokenize]
Tokenize[str_String, escape_String : "^", sep_String : "|"] := 
 Module[{results = {}, token = "", state = 0, a},
  a = Characters[str];
  Do[
   (* state 0: normal; state 1: the previous character was an escape *)
   If[state == 0,
    Switch[c,
     escape,
     state = 1
     ,
     sep,
     (* unescaped separator closes the current field *)
     AppendTo[results, token];
     token = "";
     ,
     _,
     token = token <> c;
     ]
    ,
    If[state == 1,
     (* escaped character is always taken literally *)
     token = token <> c;
     state = 0;
     ]
    ]
   
   ,
   {c, a}
   ];
  (* the last field (possibly empty) is always added *)
  AppendTo[results, token];
  results
 ]
Tokenize["one^|uno||three^^^^|four^^^|^cuatro|"]
Output:
{"one|uno", "", "three^^", "four^|cuatro", ""}

Nim[edit]

import streams

proc tokenize(s: Stream, sep: static[char] = '|', esc: static[char] = '^'): seq[string] =
  ## Reads `s` to its end and splits the text at every `sep` that is not
  ## escaped by a preceding `esc`. The character after an unescaped `esc`
  ## is taken literally and the `esc` itself is dropped. Empty fields are
  ## preserved, so the result always holds at least one string.
  ## An `esc` that is the very last character escapes nothing and is dropped.
  var buff = ""
  while not s.atEnd():
    let c = s.readChar
    case c
    of sep:
      result.add buff
      buff = ""
    of esc:
      # At end of stream readChar returns '\0' as an EOF marker; guard so
      # a trailing escape does not append a NUL to the last field.
      if not s.atEnd():
        buff.add s.readChar
    else:
      buff.add c
  result.add buff

for i, s in tokenize(newStringStream "one^|uno||three^^^^|four^^^|^cuatro|"):
    echo i, ":", s
Output:
0:one|uno
1:
2:three^^
3:four^|cuatro
4:

OCaml[edit]

(* [split_with_escaping ~esc ~sep s] splits [s] at every [sep] that is not
   escaped by a preceding [esc]. The character following [esc] is copied
   verbatim and the [esc] is dropped; an [esc] that is the last character of
   [s] is kept as an ordinary character. Empty fields are preserved. *)
let split_with_escaping ~esc ~sep s =
  let n = String.length s in
  let field = Buffer.create 16 in
  (* Return the field collected so far and reset the buffer. *)
  let flush () =
    let tok = Buffer.contents field in
    Buffer.clear field;
    tok
  in
  let rec scan pos acc =
    if pos >= n then List.rev (flush () :: acc)
    else
      let c = s.[pos] in
      if c = esc && pos + 1 < n then begin
        Buffer.add_char field s.[pos + 1];
        scan (pos + 2) acc
      end else if c = sep then begin
        let tok = flush () in
        scan (pos + 1) (tok :: acc)
      end else begin
        Buffer.add_char field c;
        scan (pos + 1) acc
      end
  in
  scan 0 []

Example:

let res = split_with_escaping ~esc:'^' ~sep:'|' "one^|uno||three^^^^|four^^^|^cuatro|";;
val res : string list = ["one|uno"; ""; "three^^"; "four^|cuatro"; ""]

Perl[edit]

The built-in split function can be used with a regex that matches the delimiter (although advanced backtracking control verbs are needed to skip escaped delimiters):

# tokenize($string, $sep, $esc)
# Splits $string at every $sep that is not escaped by a preceding $esc and
# returns the list of fields with the escaping removed. Empty fields,
# including trailing ones, are preserved.
sub tokenize {
    my ($string, $sep, $esc) = @_;
    my $sep_re = quotemeta $sep;
    my $esc_re = quotemeta $esc;

    # "escape + any char" is consumed and then rejected as a split point
    # via (*SKIP)(*FAIL), so only unescaped separators split the string.
    my @raw = split /$esc_re . (*SKIP)(*FAIL) | $sep_re/sx, $string, -1;

    # Second pass: drop each escape, keeping the character it protected.
    my @fields;
    for my $piece (@raw) {
        (my $clean = $piece) =~ s/$esc_re(.)/$1/gs;
        push @fields, $clean;
    }
    return @fields;
}

A more traditional approach is to parse the input string step by step (using a repeatedly-matching regex of the form /\G.../g), and throw away the separators (which can be done implicitly using \K):

    my @fields = $string =~ /\G (?:^ | $sep) \K (?: [^$sep$esc] | $esc .)*/gsx;

In both cases, stripping the escape characters happens as a separate step.

Testing:

print "'$_'\n" for tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^');
Output:
'one|uno'
''
'three^^'
'four^|cuatro'
''

Phix[edit]

function tokenize(string s, integer sep, integer esc)
-- Splits s at every sep that is not escaped by a preceding esc.
-- The character after an unescaped esc is taken literally and the esc is
-- dropped. Returns a sequence of strings; an empty s yields {}.
sequence fields = {}
string field = ""
integer escaped = 0

    if length(s)=0 then
        return fields
    end if
    for pos=1 to length(s) do
        integer ch = s[pos]
        if escaped then
            -- character following an escape: always literal
            field &= ch
            escaped = 0
        elsif ch=esc then
            escaped = 1
        elsif ch=sep then
            fields = append(fields,field)
            field = ""
        else
            field &= ch
        end if
    end for
    fields = append(fields,field)
    return fields
end function
 
?tokenize("one^|uno||three^^^^|four^^^|^cuatro|",'|','^')
Output:
{"one|uno","","three^^","four^|cuatro",""}

PicoLisp[edit]

# Split 'Str' at every 'Sep' that is not preceded by 'Esc'.
# The string is chopped into characters and rewritten into a new list:
# an escape is replaced by the character that follows it, an unescaped
# separator is replaced by the number 0, everything else is copied.
# The rewritten list is then split at the 0 markers.
(de tokenize (Str Sep Esc)
   (split
      (make
         (for (L (chop Str)  L)
            (let C (pop 'L)
               (cond
                  ((= C Esc) (link (pop 'L)))
                  ((= C Sep) (link 0))
                  (T (link C)) ) ) ) )
      0 ) )

Test:

(for (I . S) (tokenize "one\^|uno||three\^\^\^\^|four\^\^\^|\^cuatro|" "|" "\^")
   (prinl I ": " S) )

Output:

1: one|uno
2:
3: three^^
4: four^|cuatro
5:

PowerShell[edit]

function Split-String ([string]$String, [char]$Separator, [char]$Escape)
{
    <#
    .SYNOPSIS
        Splits a string at every separator that is not escaped.
    .DESCRIPTION
        Writes one string per field to the pipeline. The character after an
        unescaped escape character is taken literally and the escape itself
        is dropped. Empty fields are preserved, including leading and
        trailing ones, so at least one string is always emitted.
    .PARAMETER String
        The text to split.
    .PARAMETER Separator
        The field separator character.
    .PARAMETER Escape
        The escape character.
    #>
    [bool]$escaping = $false
    [string]$output = ""

    for ($i = 0; $i -lt $String.Length; $i++)
    {
        [char]$character = $String[$i]

        if ($escaping)
        {
            # Character following an escape: always literal.
            $output += $character
            $escaping = $false
        }
        elseif ($character -eq $Separator)
        {
            $output
            $output = ""
        }
        elseif ($character -eq $Escape)
        {
            $escaping = $true
        }
        else
        {
            $output += $character
        }
    }

    # Always emit the final field. It is empty when the input ends with an
    # unescaped separator, and otherwise holds the text after the last one.
    $output
}
Split-String "one^|uno||three^^^^|four^^^|^cuatro|" -Separator "|" -Escape "^" | ForEach-Object `
                                                                                        -Begin   {$n = 0} `
                                                                                        -Process {$n+= 1; "{0}: {1}" -f $n, $_}
Output:
1: one|uno
2: 
3: three^^
4: four^|cuatro
5: 

Python[edit]

Procedural[edit]

def token_with_escape(a, escape = '^', separator = '|'):
    '''
        Split `a` at each unescaped `separator` and return the list of
        fields. The character after an unescaped `escape` is kept as-is
        and the `escape` itself is dropped; empty fields are preserved.

        Issue  python -m doctest thisfile.py  to run the doctests.

        >>> print(token_with_escape('one^|uno||three^^^^|four^^^|^cuatro|'))
        ['one|uno', '', 'three^^', 'four^|cuatro', '']
    '''
    fields = []
    current = []
    escaped = False
    for ch in a:
        if escaped:
            # Character following an escape: always literal.
            current.append(ch)
            escaped = False
        elif ch == escape:
            escaped = True
        elif ch == separator:
            fields.append(''.join(current))
            current = []
        else:
            current.append(ch)
    fields.append(''.join(current))
    return fields

Functional[edit]

Works with: Python version 3
'''Tokenize a string with escaping'''

from functools import reduce


# tokenize :: Char -> Char -> String -> [String]
def tokenize(delim):
    '''Curried tokenizer: tokenize(delim)(esc)(s) is the list
       of fields of s split at each unescaped delim, with the
       escape characters that escaped something removed.
    '''
    def go(esc, s):
        done = []
        current = []
        escaped = False
        for ch in s:
            is_esc = (not escaped) and (esc == ch)
            if (not escaped) and (delim == ch):
                done.append(''.join(current))
                current = []
            elif not is_esc:
                current.append(ch)
            escaped = is_esc
        done.append(''.join(current))
        return done
    return lambda esc: lambda s: go(esc, s)


# --------------------------TEST---------------------------
# main :: IO ()
def main():
    '''Print the fields of the task's sample string.'''

    sample = "one^|uno||three^^^^|four^^^|^cuatro|"
    splitOnPipe = tokenize('|')
    print(splitOnPipe('^')(sample))


# MAIN ---
if __name__ == '__main__':
    main()
Output:
['one|uno', '', 'three^^', 'four^|cuatro', '']

Regex-based[edit]

Using Scanner[edit]

The python re library has a handy class Scanner which is intended precisely for this use-case. It takes a list of pairs regex, action and whenever it encounters regex in the input, it executes action. This allows us to solve this task very efficiently with minimum effort, the hardest part being the correct definition of the regular expressions.

The following code also illustrates an important feature of Python ‒ nested functions with closures. Owing to this feature, the inner functions, such as start_new_token, are able to access the local variable tokens of their enclosing function tokenize. For the inner function, the name tokens is nonlocal, and is in the enclosing scope of the inner function (as opposed to the parameters scanner and substring, which are in the local scope).

import re

STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'

def tokenize(string=STRING, escape='^', separator='|'):
    """Split ``string`` at every unescaped ``separator`` using re.Scanner.

    Parameters:
        string:    the text to split.
        escape:    single character that protects the character after it.
        separator: single character that delimits fields.

    Returns:
        list of str -- the fields, with the escaping characters removed.
        Empty fields are kept.

    An escape character that ends the input has nothing to protect; the
    scanner stops there and the dangling escape is dropped.
    """

    escape, separator = map(re.escape, (escape, separator))

    tokens = ['']

    def start_new_token(scanner, substring):
        tokens.append('')

    def add_escaped_char(scanner, substring):
        # substring is the escape followed by the protected character
        char = substring[1]
        tokens[-1] += char

    def add_substring(scanner, substring):
        tokens[-1] += substring

    # re.DOTALL lets '.' match a newline as well.  Without it an escaped
    # newline matches none of the rules, the scanner stops at that point
    # and the rest of the input is silently lost.
    re.Scanner([
        # an escape followed by a character produces that character
        (fr'{escape}.', add_escaped_char),

        # when encountering a separator not preceded by an escape,
        # start a new token
        (fr'{separator}', start_new_token),

        # a sequence of regular characters (i.e. not escape or separator)
        # is just appended to the token
        (fr'[^{escape}{separator}]+', add_substring),
    ], flags=re.DOTALL).scan(string)

    return tokens


if __name__ == '__main__':
    print(list(tokenize()))

Output is the same as in the functional Python version above.


Simpler version with preprocessing[edit]

This version does not require any extra state, such as the token list in the Scanner-based version above. It first preprocesses the input, since Python does not support variable-length lookbehind assertions. Then it works only with the primitive regex operations re.findall and re.sub. Note that the regex used here is compiled with the re.VERBOSE flag. This allows us to write the regex on several lines (since unescaped whitespace is ignored in this mode), and use comments inside the regex (starting with #).

import re

STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'

def tokenize(string=STRING, escape='^', separator='|'):
    """Yield the fields of ``string``, split at every unescaped ``separator``.

    Parameters:
        string:    the text to split.
        escape:    single character that protects the character after it.
        separator: single character that delimits fields.

    Yields:
        str -- each field in order, with the escaping characters removed.
        Empty fields are kept.  A dangling escape at the very end of the
        input is dropped.
    """

    re_escape, re_separator = map(re.escape, (escape, separator))

    # field regex
    regex = re.compile(fr'''
        # every field is introduced by an unescaped separator, which is
        # consumed as part of the match.  (A lookbehind is not enough:
        # it would also succeed right after a field that ends with an
        # escaped separator and report a spurious empty field there.)
        {re_separator}

        # the field itself consists either of escape sequences
        # or of regular (non-escape, non-separator) characters,
        # repeated arbitrarily many times (even zero)
        (
            (?:{re_escape}.|[^{re_escape}{re_separator}])*
        )
      ''',
      # DOTALL so that an escaped newline is an escape sequence too
      flags=re.VERBOSE | re.DOTALL
    )

    # since each field must be introduced by a separator,
    # we must add an extra separator at the beginning of input
    preprocessed_string = separator + string

    for almost_token in regex.findall(preprocessed_string):
      # now get rid of escape characters: '^^' -> '^' etc.
      token = re.sub(fr'{re_escape}(.)', r'\1', almost_token, flags=re.DOTALL)
      yield token

if __name__ == '__main__':
    print(list(tokenize()))

Racket[edit]

#lang racket/base
(require racket/match)

;; Returns a tokenising function based on sep and esc.
;; (tokenise-with-escape sep esc) is curried: it yields a procedure that takes
;; the string to split and returns a list of strings.
;; Note: an empty input string yields the empty list, not a list holding one
;; empty field.
(define ((tokenise-with-escape sep esc) str)
  ;; the current field is accumulated as a reversed list of characters
  (define tsil->string (compose list->string reverse))
  ;; rem   : characters still to be consumed
  ;; l-acc : reversed characters of the field being built
  ;; acc   : reversed list of completed fields
  (define (inr rem l-acc acc)
    (match rem
      ;; end of input: emit the last field, unless nothing at all was read
      ['() (if (and (null? acc) (null? l-acc)) null (reverse (cons (tsil->string l-acc) acc)))]
      ;; separator: close the current field and start a new one
      [(list (== sep)   tl ...) (inr tl null (cons (tsil->string l-acc) acc))]
      ;; escape followed by any character: keep that character, drop the escape
      [(list (== esc) c tl ...) (inr tl (cons c l-acc) acc)]
      ;; anything else, including an escape that is the very last character,
      ;; is kept as-is
      [(list c          tl ...) (inr tl (cons c l-acc) acc)]))
  (inr (string->list str) null null))

;; This is the tokeniser that matches the parameters in the task
(define task-tokeniser (tokenise-with-escape #\| #\^))

(define (report-input-output str)
  (printf "Input:  ~s~%Output: ~s~%~%" str (task-tokeniser str)))

(report-input-output "one^|uno||three^^^^|four^^^|^cuatro|")
(report-input-output "")
(report-input-output "|")
(report-input-output "^")
(report-input-output ".")
Output:
Input:  "one^|uno||three^^^^|four^^^|^cuatro|"
Output: ("one|uno" "" "three^^" "four^|cuatro" "")

Input:  ""
Output: ()

Input:  "|"
Output: ("" "")

Input:  "^"
Output: ("^")

Input:  "."
Output: (".")

Raku[edit]

(formerly Perl 6)

# Split $string at every unescaped $sep; both $sep and $esc are required
# named arguments.
sub tokenize ($string, :$sep!, :$esc!) {
    # The capture group describes one field: any run of either
    #   - a character that is neither $sep nor $esc, or
    #   - $esc followed by any character.
    # "+ % $sep" repeats the field with $sep in between, so .[0] holds one
    # capture per field.
    # The subst then removes every escaping $esc; the ")>" marker ends the
    # reported match right after $esc, so the escaped character is kept.
    return $string.match(/([ <!before $sep | $esc> . | $esc . ]*)+ % $sep/)\
                  .[0].map(*.subst: /$esc )> ./, '', :g);
}

say "'$_'" for tokenize 'one^|uno||three^^^^|four^^^|^cuatro|', sep => '|', esc => '^';
Output:
'one|uno'
''
'three^^'
'four^|cuatro'
''

Notable Raku innovations that make this different from the equivalent #Perl solution:

  • string variables can be safely interpolated into regexes without having to 'quotemeta' them
  • regex matches return a nested Match object which allows retrieving all results for a given capture group (rather than just the last thing that it matched), thus getting rid of the need for repeated global matching
  • the <field>+ % <delimiter> regex construct allows handling the delimiters in a more idiomatic way
  • the )> regex construct can be used to exclude anything that follows it from the returned match result

REXX[edit]

IF/THEN logic[edit]

/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
  str = 'one^|uno||three^^^^|four^^^|^cuatro|'   /*the character string to be tokenized.*/
  esc = '^'                                      /*the escape    character to be used.  */
  sep = '|'                                      /*the separator character to be used.  */
  out =                                          /*the current token  (empty so far).   */
eMode = 0                                        /*a flag:  1 = previous char was  esc. */

  do j=1  for length(str);  _=substr(str, j, 1)  /*parse a single character at a time.  */
  if eMode   then do; out=out || _;  eMode=0;  iterate;  end   /*escaped: keep the char.*/
  if _==esc  then do;                eMode=1;  iterate;  end   /*escape:  drop it, flag.*/
  if _==sep  then do; call show;               iterate;  end   /*separator: emit token. */
  out=out || _                                                 /*append the character.  */
  end   /*j*/

if out\=='' | _==sep  then call show             /*emit the last token: non-empty, or   */
                                                 /*empty because the last char was  sep.*/
exit                                             /*stick a fork in it,  we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
/*SHOW:  displays the current token with its length, then resets the token to empty.    */
show:  say  '[length'right(length(out),4)"]"   out;             out=;               return

output

[length   7] one|uno
[length   0]
[length   7] three^^
[length  12] four^|cuatro
[length   0]

SELECT logic[edit]

This REXX version also shows a scale in the output.

/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
  str = 'one^|uno||three^^^^|four^^^|^cuatro|'   /*the character string to be tokenized.*/
  esc = '^'                                      /*the escape    character to be used.  */
  sep = '|'                                      /*the separator character to be used.  */
  $   =                                          /*the current token  (empty so far).   */
eMode = 0                                        /*a flag:  1 = previous char was  esc. */
say ' output len        output'                  /*title  verbiage  used for the output.*/
say '──────────── ────────────────────'          /*title separator  used for the output.*/

  do j=1  for length(str);  _=substr(str, j, 1)  /*parse a single character at a time.  */
      select
      when eMode   then do; $=$ || _;  eMode=0; end      /*are we in escape mode?       */
      when _==esc  then                eMode=1           /*is it an  escape  character? */
      when _==sep  then do; call show;          end      /*is it a separator character? */
      otherwise             $=$ || _                     /*append the single character. */
      end   /*select*/
  end       /*j*/

if $\=='' | _==sep  then call show               /*emit the last token: non-empty, or   */
                                                 /*empty because the last char was  sep.*/
say '──────────── ────────────────────'          /*the foot separator for the output.   */
say '             ····^····1····^····2'          /*show the    top    part of the scale.*/
say '  {scale}    12345678901234567890'          /*show the   bottom  part of the scale.*/
exit                                             /*stick a fork in it,  we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
/*SHOW:  displays the current token with its length, then resets the token to empty.    */
show:  say  '[length'right(length($),4)"]"   $;                 $=;                 return

output

 output len        output
──────────── ────────────────────
[length   7] one|uno
[length   0]
[length   7] three^^
[length  12] four^|cuatro
[length   0]
──────────── ────────────────────
             ····^····1····^····2
  {scale}    12345678901234567890

Ring[edit]

# Demo: print the fields of the task's sample string
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")

# Prints the fields of src, one per line and each prefixed with its
# 1-based number.  Fields are delimited by unescaped sep characters; a
# character preceded by an unescaped esc is printed literally.
# Nothing is returned: the output is written directly with "see".
func tokenize(src, sep, esc)
field = 1
escaping = false
# header of the first field
see "" + field + " "
for i = 1 to len(src)
    char = substr(src, i, 1)
    if escaping 
       # previous character was an escape: print this one verbatim
       see char
       escaping = false
    else
       switch char 
              on sep
                 # end the current line and start the next numbered field
                 see nl
                 field = field + 1
                 see "" + field + " "
              on esc
                 # swallow the escape itself, remember it for the next char
                 escaping = true
              other
                 see char 
       off
    ok
next
see nl

Output:

1 one|uno
2 
3 three^^
4 four^|cuatro
5 

Ruby[edit]

I had to drop the \K flag and instead remove the separator at the beginning manually. I am not sure whether \K is broken or works differently than in Perl.

Translation of: Perl
# Splits +string+ at every unescaped +sep+ and returns the fields as an Array
# of Strings.  A character preceded by an unescaped +esc+ is taken literally
# and the escape itself is removed.  Empty fields are preserved; a dangling
# escape at the very end of the input is dropped.
def tokenize(string, sep, esc)
  sep_re = Regexp.escape(sep)
  esc_re = Regexp.escape(esc)

  # Every field is introduced by a separator, which is consumed by the match
  # and left out of the capture.  Prefixing the input with one separator lets
  # the first field follow the same rule, so no match is ever empty and the
  # separator never has to be stripped after unescaping (doing it in that
  # order used to eat a leading escaped separator).
  # /m makes '.' match a newline, so an escaped newline is handled too.
  field = /#{sep_re}((?:[^#{sep_re}#{esc_re}]|#{esc_re}.)*)/m

  (sep + string).scan(field).map do |captures|
    captures.first.gsub(/#{esc_re}(.)/m, '\1')
  end
end

p tokenize('one^|uno||three^^^^|four^^^|^cuatro|', '|', '^')

Rust[edit]

/// Character that delimits fields.
const SEPARATOR: char = '|';
/// Character that makes the following character literal.
const ESCAPE: char = '^';
/// Sample input from the task description.
const STRING: &str = "one^|uno||three^^^^|four^^^|^cuatro|";

/// Splits `string` at every unescaped `SEPARATOR`.
///
/// The character following an `ESCAPE` is copied verbatim and the escape
/// itself is removed.  Empty fields are kept.  An `ESCAPE` that is the last
/// character of the input has nothing to protect and is dropped.
fn tokenize(string: &str) -> Vec<String> {
    let mut fields: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut escaped = false;

    for c in string.chars() {
        if escaped {
            // previous character was an escape: take this one literally
            current.push(c);
            escaped = false;
        } else if c == SEPARATOR {
            // hand the finished field over and start an empty one
            fields.push(std::mem::take(&mut current));
        } else if c == ESCAPE {
            escaped = true;
        } else {
            current.push(c);
        }
    }

    // the trailing field is always emitted, even when empty
    fields.push(current);
    fields
}

/// Pretty-prints the fields of the sample string.
fn main() {
    let tokens = tokenize(STRING);
    println!("{:#?}", tokens);
}
Output:
[
    "one|uno",
    "",
    "three^^",
    "four^|cuatro",
    "",
]

Scala[edit]

Old fashioned Imperative[edit]

Imperative style, with the (ugly) mutable variables removed.

Translation of: Kotlin
object TokenizeStringWithEscaping0 extends App {

  // Placeholders that temporarily stand in for an escaped escape and an
  // escaped separator.  They are assumed not to occur in the input.
  // Declared as constant definitions so they are usable even when the
  // object's App body has not been run.
  final val markerSpE = "\ufffe"
  final val markerSpF = "\uffff"

  /** Splits `str` at every unescaped `sep`.
    *
    * Escaped escapes and escaped separators are first replaced by
    * placeholders, the remaining escape characters are removed, the string is
    * split and finally the placeholders are turned back into the literal
    * characters.
    *
    * @param str the text to split; may be empty (yields one empty field)
    * @param sep the separator
    * @param esc the escape character
    * @return the fields in order; empty fields, including a trailing one,
    *         are preserved
    */
  def tokenize(str: String, sep: String, esc: String): Array[String] = {

    val s0 = str.replace(esc + esc, markerSpE).replace(esc + sep, markerSpF)
    val withoutEscapes = s0.replace(esc, "")
    // a dangling escape at the very end is kept literally;
    // endsWith is also safe for the empty string, unlike `last`
    val s = if (s0.endsWith(esc)) withoutEscapes + esc else withoutEscapes
    // limit -1 keeps trailing empty fields, which a plain split would drop;
    // quote() makes the separator literal rather than a regex
    s.split(java.util.regex.Pattern.quote(sep), -1)
      .map(_.replace(markerSpE, esc).replace(markerSpF, sep))
  }

  def str = "one^|uno||three^^^^|four^^^|^cuatro|"

  tokenize(str, "|", "^").foreach(it => println(if (it.isEmpty) "<empty token>" else it))
}

Idiomatic[edit]

Functional with Tail recursion[edit]

import scala.annotation.tailrec

object TokenizeStringWithEscaping1 extends App {

  // Splits `str` at every unescaped `sep`; `esc` makes the following
  // separator or escape character literal.  Returns the fields in order,
  // empty fields included.
  def tokenize(str: String, sep: String, esc: String): Seq[String] = {
    // accu holds the fields found so far; its last element is the field
    // currently being built.  s is the input still to be consumed.
    @tailrec
    def loop(accu: Seq[String], s: String): Seq[String] = {
      // returns accu with `char` appended to its last (current) field
      def append2StringInList(char: String): Seq[String] =
        accu.init :+ (accu.last + char)

      s.length match {
        case 0 => accu
        // a single remaining character: a separator opens a final empty
        // field, anything else (a lone escape included) is appended as-is
        case 1 => if (s.head.toString == sep) accu :+ "" else append2StringInList(s)
        // two or more characters left: decide on the first two
        case _ => (s.head.toString, s.tail.head.toString) match {
          // escaped separator or escaped escape: keep the second character
          // literally and consume both
          case c@((`esc`, `sep`) | (`esc`, `esc`)) => loop(append2StringInList(c._2), s.tail.tail)
          // unescaped separator: start a new empty field
          case (`sep`, _)                          => loop(accu :+ "", s.tail)
          // escape before an ordinary character: drop the escape only
          case (`esc`, _)                          => loop(accu, s.tail)
          // ordinary character: append it to the current field
          case (sub, _)                            => loop(append2StringInList(sub.head.toString), s.tail)
        }
      }
    }

    loop(Seq(""), str)
  }

  def str = "one^|uno||three^^^^|four^^^|^cuatro|"

  tokenize(str, "|", "^")
    .foreach(it =>
      println(
        f"[length:${it.length}%3d] ${if (it.isEmpty) "<empty token>" else it}"))
}
Output:
See it running in your browser by ScalaFiddle (JavaScript) or by Scastie (JVM).

Sidef[edit]

Translation of: Perl
# Splits `string` at each unescaped `sep` and returns the list of fields
# with the escaping `esc` characters removed.
func tokenize(string, sep, esc) {
    # The regex has two alternatives: an escape followed by any character,
    # which is skipped and made to fail so it can never act as a split point,
    # or the separator itself.
    # NOTE(review): the -1 limit presumably keeps trailing empty fields, as
    # in Perl's split -- confirm.
    var fields = string.split(
        Regex(esc.escape + '.(*SKIP)(*FAIL)|' + sep.escape, 's'), -1
    )
    # unescape: replace each "escape + character" pair by the character alone
    fields.map{.gsub(Regex(esc.escape + '(.)'), {|s1| s1 }) }
}

tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^').each { |str|
    say str.dump
}
Output:
"one|uno"
""
"three^^"
"four^|cuatro"
""

Simula[edit]

SIMSET
BEGIN

    LINK CLASS ITEM(TXT); TEXT TXT;;

    REF(HEAD) PROCEDURE SPLIT(TXT, SEP, ESC); TEXT TXT; CHARACTER SEP, ESC;
    BEGIN
        COMMENT Splits TXT at each unescaped SEP and returns the fields as
                ITEM objects linked, in order, into a new HEAD.
                The character following ESC is copied literally.
                ESC as the very last character of TXT is reported via ERROR;
        REF(HEAD) PARTS;
        CHARACTER CH;
        TEXT PART;

        COMMENT PART is a scratch buffer as long as TXT, its POS marks the
                end of the field collected so far;
        PART :- BLANKS(TXT.LENGTH);
        PARTS :- NEW HEAD;
        TXT.SETPOS(1);
        WHILE TXT.MORE DO BEGIN
            CH := TXT.GETCHAR;
            IF CH = ESC THEN BEGIN
                IF TXT.MORE THEN BEGIN
                    CH := TXT.GETCHAR;
                    PART.PUTCHAR(CH);
                END ELSE BEGIN
                    ERROR("SPLIT: ESCAPE CHAR AT END OF STRING");
                END;
            END ELSE IF CH = SEP THEN BEGIN
                COMMENT store a copy of the filled prefix and rewind the buffer;
                NEW ITEM(COPY(PART.SUB(1,PART.POS-1))).INTO(PARTS);
                PART.SETPOS(1);
            END ELSE BEGIN
                PART.PUTCHAR(CH);
            END;
        END;
        COMMENT the last field is always stored, even when it is empty;
        NEW ITEM(COPY(PART.SUB(1,PART.POS-1))).INTO(PARTS);

        SPLIT :- PARTS;
    END SPLIT;

    TEXT EXAMPLE;
    REF(HEAD) RESULT;
    REF(ITEM) PART;
    INTEGER NO;

    FOR EXAMPLE :- "ONE^|UNO||THREE^^^^|FOUR^^^|^CUATRO|" DO
    BEGIN
        OUTTEXT("INPUT: '");
        OUTTEXT(EXAMPLE);
        OUTTEXT("'");
        OUTIMAGE;
        RESULT :- SPLIT(EXAMPLE, '|', '^');
        PART :- RESULT.FIRST;
        NO := 0;
        WHILE PART =/= NONE DO
        BEGIN
            NO := NO + 1;
            OUTTEXT("PART");
            OUTINT(NO, 0);
            OUTTEXT(": '");
            OUTTEXT(PART.TXT);
            OUTTEXT("'");
            OUTIMAGE;
            PART :- PART.SUC;
        END;
    END;

END.
Output:
INPUT: 'ONE^|UNO||THREE^^^^|FOUR^^^|^CUATRO|'
PART1: 'ONE|UNO'
PART2: ''
PART3: 'THREE^^'
PART4: 'FOUR^|CUATRO'
PART5: ''

SNOBOL4[edit]

* Program: tokenize_with_escape.sbl
* To run: sbl tokenize_with_escape.sbl
* Description: Tokenize a string with escaping
* Comment: Tested using the Spitbol for Linux version of SNOBOL4

	lf = substr(&alphabet,11,1) ;* New line or line feed


* Function tokenize breaks string s into the parts separated by
* character c and returns them as an array. Parameter c defaults to
* a comma; only its first character is used. Parameter kp=1 keeps
* null (empty) parts, which is the default, and kp=0 discards them.
* The parts are first collected in table t and then copied into the
* result array.
	define('tokenize(s,c,kp)tokenizepat,part,t,i,j')
	:(tokenize_end)
tokenize
	c = (ident(c) ',', substr(c,1,1)) :f(freturn)
	kp = (ident(kp) 1, eq(kp,0) 0, 1) :f(freturn)
	t = table()
	tokenizepat = breakx(c) . part c | (len(1) rem) . part
	s ? eq(kp,1) rtab(1) c = s c
tokenize1
	s ? tokenizepat = "" :f(tokenize2)
	t[i = eq(kp,0) differ(part) i + 1] = part
	t[i = eq(kp,1) i + 1] = part
	:(tokenize1)
tokenize2
	tokenize = array(i) :f(errr)
	j = 0
tokenize3	tokenize[j = lt(j,i) j + 1] = t[j] :s(tokenize3)
	:(return)
tokenize_end


* Function tokcan normalizes string ts by applying the separator and
* escape rules to it. Parameter sep is the separator, while esc is the
* escape character. Parameter tesc is the new separator character that is
* substituted for every unescaped sep. It defaults to a comma, ",".
* In the result an escaped separator becomes a plain sep, an escaped
* escape becomes a plain esc, and an escape followed by any other
* character becomes that character.
	define('tokcan(ts,sep,esc,tesc)tpat,part1,part2,notany') :(tokcan_end)
tokcan
	tesc = (ident(tesc) ',', substr(tesc,1,1))
	tpat = (breakx(sep esc) . part1
+		(sep | esc sep | esc esc | (esc len(1) . notany)) . part2
+		)
+		| (len(1) rem) . part1

tokcan1
	ts ? tpat = :f(tokcan2)
	part2 = (leq(part2,sep) tesc
+		,leq(part2,esc sep) sep
+		,leq(part2,esc esc) esc
+		,differ(notany) leq(part2,esc notany) notany
+		)
	tokcan = (ident(tokcan) "", tokcan) part1 part2
	:(tokcan1)
tokcan2
	:(return)
tokcan_end


	test_string = "one^|uno||three^^^^|four^^^|^cuatro|"
	sep = "|"
	esc = "^"

	hline = tokcan(test_string,sep,esc) :f(err)


	output = "  Input: " test_string lf
	output = "Output1: " hline lf

	output = "Output2: "
	tokenized = tokenize(hline,",")

p1	output = "'" tokenized[z = z + 1] "'" :s(p1)

END
Output:
  Input: one^|uno||three^^^^|four^^^|^cuatro|

Output1: one|uno,,three^^,four^|cuatro,

Output2: 
'one|uno'
''
'three^^'
'four^|cuatro'
''

Swift[edit]

Translation of: Rust
extension String {
  /// Splits the string at every unescaped `separator`.
  ///
  /// The character following an `escape` is copied verbatim and the escape
  /// itself is removed. Empty fields are kept. An `escape` that is the last
  /// character has nothing to protect and is dropped.
  ///
  /// - Parameters:
  ///   - separator: the character that delimits fields
  ///   - escape: the character that makes the next character literal
  /// - Returns: the fields, in order
  func tokenize(separator: Character, escape: Character) -> [String] {
    var fields = [String]()
    var current = ""
    var escaped = false

    for ch in self {
      if escaped {
        // previous character was an escape: take this one literally
        current.append(ch)
        escaped = false
      } else if ch == separator {
        fields.append(current)
        current = ""
      } else if ch == escape {
        escaped = true
      } else {
        current.append(ch)
      }
    }

    // the trailing field is always emitted, even when empty
    fields.append(current)

    return fields
  }
}

print("one^|uno||three^^^^|four^^^|^cuatro|".tokenize(separator: "|", escape: "^"))
Output:
["one|uno", "", "three^^", "four^|cuatro", ""]

Tcl[edit]

Putting a coroutine in a TclOO object following the "generator pattern" gives a nice structure:

# Character iterator over a string, following the "generator pattern":
# each call of the object's "next" method returns the next character.
# When the string is exhausted, "next" returns with code "break", which
# terminates the caller's enclosing loop.
oo::class create tokens {
    constructor {s} {
        # Start the coroutine; its first yield merely hands back its own
        # name, which is not needed (and must not be printed).
        coroutine Next my Iter $s
        oo::objdefine [self] forward next Next
    }
    method Iter {s} {
        yield [info coroutine]
        for {set i 0} {$i < [string length $s]} {incr i} {
            yield [string index $s $i]
        }
        return -code break
    }
}

# Splits s at every unescaped sep and returns the list of fields.
# The character following escape is taken literally and the escape itself
# is dropped; an escape at the very end of s is ignored.
proc tokenize {s {sep |} {escape ^}} {
    set part ""
    set parts ""
    set iter [tokens new $s]
    # the loop is left through the "break" raised by the exhausted iterator
    while {1} {
        set c [$iter next]
        if {$c eq $escape} {
            append part [$iter next]
        } elseif {$c eq $sep} {
            lappend parts $part
            set part ""
        } else {
            append part $c
        }
    }
    # release the iterator object instead of leaking one per call
    $iter destroy
    lappend parts $part
    return $parts
}

puts [tokenize one^|uno||three^^^^|four^^^|^cuatro| | ^]
Output:
one|uno {} three^^ four^|cuatro {}

TMG[edit]

Unix TMG:

/* Tokenize a string with escaping.
   As the sample input shows, the first input line holds the separator
   character, the second line the escape character, and the rest is the
   string to split.  Each token is written on a line of its own, enclosed
   in double quotes.
   NOTE(review): presumably "str" marks the start of a token and "outp"
   emits the text scanned since that mark -- confirm against the TMG manual. */
prog:   char(sep) *
        char(esc) *
str:    smark
token:  forw/outp
        ( [ch==esc?] char(ch) any(!<<>>) token
        | [ch==sep?] char(ch) outp str
        | any(!<<>>) token );
outp:   parse(( scopy = { <"> 1 <"> * } ));
forw:   peek/chkeof;
peek:   [ch=0] char(ch) fail;
chkeof: ( [ch?] succ | fail );

ch:     0;
sep:    0;
esc:    0;

Input:

|
^
one^|uno||three^^^^|four^^^|^cuatro|

Output:

"one|uno"
""
"three^^"
"four^|cuatro"
""

VBA[edit]

Translation of: Phix
' Splits s at every unescaped sep and returns the fields as a Collection of
' Strings.  The character following esc is taken literally and the escape
' itself is dropped.  Empty fields are kept.
' An empty s yields an empty Collection.
Private Function tokenize(s As String, sep As String, esc As String) As Collection
    Dim ret As New Collection
    Dim this As String      ' field currently being built
    Dim skip As Boolean     ' True when the previous character was esc
    Dim i As Long           ' declared so the module compiles under Option Explicit
    Dim si As String        ' current character

    If Len(s) <> 0 Then
        For i = 1 To Len(s)
            si = Mid(s, i, 1)
            If skip Then
                ' escaped character: copy it verbatim
                this = this & si
                skip = False
            ElseIf si = esc Then
                skip = True
            ElseIf si = sep Then
                ret.Add this
                this = ""
            Else
                this = this & si
            End If
        Next i
        ' the last field is always added, even when empty
        ret.Add this
    End If
    Set tokenize = ret
End Function

' Tokenizes the task's sample string and prints the fields, comma separated,
' to the Immediate window.
Public Sub main()
    Dim out As Collection
    Dim outstring() As String
    Dim i As Long           ' declared so the module compiles under Option Explicit

    Set out = tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
    ' copy the 1-based Collection into a 0-based array so Join can be used
    ReDim outstring(out.Count - 1)
    For i = 0 To out.Count - 1
        outstring(i) = out(i + 1)
    Next i
    Debug.Print Join(outstring, ", ")
End Sub
Output:
one|uno, , three^^, four^|cuatro, 

Vlang[edit]

Translation of: Go
// tokenize_string splits `s` at every unescaped `sep` byte.
// The byte following `escape` is copied verbatim and the escape itself is
// dropped. Empty fields are kept.
// An error is returned when `s` ends with an unescaped `escape`.
fn tokenize_string(s string, sep u8, escape u8) ?[]string {
	mut fields := []string{}
	mut current := []u8{}
	mut escaped := false
	for b in s {
		if escaped {
			// previous byte was an escape: take this one literally
			current << b
			escaped = false
			continue
		}
		if b == escape {
			escaped = true
			continue
		}
		if b == sep {
			fields << current.bytestr()
			current = []u8{}
			continue
		}
		current << b
	}
	if escaped {
		return error("invalid terminal escape")
	}
	// the trailing field is always emitted, even when empty
	fields << current.bytestr()
	return fields
}

const sample = "one^|uno||three^^^^|four^^^|^cuatro|"
const separator = `|`
const escape = `^`
// main prints the sample input and the tokens produced from it.
fn main() {
	println("Input:   $sample")
	// `?` propagates the error tokenize_string returns for a dangling escape
	tokens := tokenize_string(sample, separator, escape)?
	println("Tokens: $tokens")
}
Output:
Input:   one^|uno||three^^^^|four^^^|^cuatro|
Tokens: ['one|uno', '', 'three^^', 'four^|cuatro', '']

Wren[edit]

Translation of: Kotlin
// Placeholders that temporarily stand in for an escaped escape and an
// escaped separator while the string is being split.
var SPE = "\ufffe"  // unused unicode character in Specials block
var SPF = "\uffff"  // ditto

// Splits 'str' at every unescaped 'sep' and returns the list of fields.
// Escaped escapes and escaped separators are first replaced by the
// placeholders above, the remaining escapes are removed, the string is split
// and finally the placeholders are turned back into literal characters.
// A dangling escape at the very end of the input is kept literally.
var tokenize = Fn.new { |str, sep, esc|
    str = str.replace(esc + esc, SPE).replace(esc + sep, SPF)
    // endsWith is safe for an empty string, whereas str[-1] would abort
    // with a subscript error
    str = str.endsWith(esc) ? str[0...-1].replace(esc, "") + esc : str.replace(esc, "")
    return str.split(sep).map { |s| s.replace(SPE, esc).replace(SPF, sep) }.toList
}

var str = "one^|uno||three^^^^|four^^^|^cuatro|"
var sep = "|"
var esc = "^"
var items = tokenize.call(str, sep, esc)
for (item in items) System.print((item == "") ? "(empty)" : item)
Output:
one|uno
(empty)
three^^
four^|cuatro
(empty)

zkl[edit]

Two simplifying assumptions (since their behavior is undefined): A string ending with an un-escaped escape is an error and 0xff is not an allowed character in the string.

// Splits str at each unescaped sep and returns the list of fields.
// The text is rewritten into a sink with every unescaped separator replaced
// by "\xff", then split on that byte (so "\xff" must not occur in str).
fcn tokenize(str,sep,esc){
   sink:=Sink(String);
   foreach c in (str){
      switch(c){
         // escape: write the following character instead, taken literally
         case(esc){ sink.write(__cWalker.next()); }  // error if ^EoS
	 case(sep){ sink.write("\xff"); }
	 else     { sink.write(c) }
      }
   }
   sink.close().split("\xff");
}

Or, if you prefer brevity:

// Compact variant: splits str at each unescaped sep and returns the fields.
// Unescaped separators are rewritten to "\xff" and the result is split on it.
fcn tokenize(str,sep,esc){
   sink:=Sink(String);
   // per character, write: the character after an escape, or "\xff" for a
   // separator, or else the character itself
   foreach c in (str){ sink.write( (c==esc and __cWalker.next()) or (c==sep and "\xff") or c ) }
   sink.close().split("\xff");
}
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|","^").println();
Output:
L("one|uno","","three^^","four^|cuatro","")