Multisplit: Difference between revisions
(→{{header|Python}}: Add RE version) |
m (→Using Regular expressions: handle edge cases consistently.) |
||
Line 51: | Line 51: | ||
<lang python>>>> import re |
<lang python>>>> import re |
||
>>> def ms2(txt="a!===b=!=c", sep=["==", "!=", "="]): |
>>> def ms2(txt="a!===b=!=c", sep=["==", "!=", "="]): |
||
if not txt or not sep: |
|||
⚫ | |||
return [] |
|||
⚫ | |||
for m in re.finditer('(.*?)(?:' + '|'.join('('+re.escape(s)+')' for s in sep) + ')', txt): |
for m in re.finditer('(.*?)(?:' + '|'.join('('+re.escape(s)+')' for s in sep) + ')', txt): |
||
ans += [m.group(1), (m.lastindex-2, m.start(m.lastindex))] |
ans += [m.group(1), (m.lastindex-2, m.start(m.lastindex))] |
||
if txt[m.end(m.lastindex):]: |
if m and txt[m.end(m.lastindex):]: |
||
ans += [txt[m.end(m.lastindex):]] |
ans += [txt[m.end(m.lastindex):]] |
||
return ans |
return ans |
Revision as of 08:21, 28 February 2011
Code to split string with several separators.
Input: string, list of separators
Output: [Sub0, [Sep0Num, Sep0Pos], Sub1, [Sep1Num, Sep1Pos], ..., SubN]
Note: Sub - substring, SepNum - separator number in input list, SepPos - separator position in input string.
Input order of separators is important: they are considered in that order.
J
<lang j>multisplit=:4 :0
'sep begin'=.|:t=. y /:~&.:(|."1)@;@(i.@#@[ ,.L:0"0 I.@E.L:0) x end=. begin + sep { #@>y last=.next=.0 r=.2 0$0 while.next<#begin do. r=.r,.(last}.x{.~next{begin);next{t last=.next{end next=.1 i.~(begin>next{begin)*.begin>:last end. r=.r,.;~last}.x
)</lang>
Explanation:
First find all potentially relevant separator instances, and sort them in increasing order, by starting location and separator index. sep
is separator index, and begin
is starting location. end
is ending location.
Then, loop through the possibilities, skipping over those which conflict with the currently selected sequence.
Example use:
<lang j> S multisplit '==';'!=';'=' ┌───┬───┬───┬───┬─┐ │a │ │b │ │c│ ├───┼───┼───┼───┼─┤ │1 1│0 3│2 6│1 7│ │ └───┴───┴───┴───┴─┘
S multisplit '=';'!=';'=='
┌───┬───┬───┬───┬───┬─┐ │a │ │ │b │ │c│ ├───┼───┼───┼───┼───┼─┤ │1 1│0 3│0 4│0 6│1 7│ │ └───┴───┴───┴───┴───┴─┘
'X123Y' multisplit '1';'12';'123';'23';'3'
┌───┬───┬─┐ │X │ │Y│ ├───┼───┼─┤ │0 1│3 2│ │ └───┴───┴─┘</lang>
Python
Using Regular expressions
<lang python>>>> import re >>> def ms2(txt="a!===b=!=c", sep=["==", "!=", "="]): if not txt or not sep: return [] ans = m = [] for m in re.finditer('(.*?)(?:' + '|'.join('('+re.escape(s)+')' for s in sep) + ')', txt): ans += [m.group(1), (m.lastindex-2, m.start(m.lastindex))] if m and txt[m.end(m.lastindex):]: ans += [txt[m.end(m.lastindex):]] return ans
>>> ms2() ['a', (1, 1), , (0, 3), 'b', (2, 6), , (1, 7), 'c'] >>> ms2(txt="a!===b=!=c", sep=["=", "!=", "=="]) ['a', (1, 1), , (0, 3), , (0, 4), 'b', (0, 6), , (1, 7), 'c']</lang>
Not using RE's
<lang python>>>> def ms(txt="a!===b=!=c", sep=["==", "!=", "="]): size = [len(s) for s in sep] ans, pos0 = [], 0 def getfinds(): return [(-txt.find(s, pos0), -sepnum, size[sepnum]) for sepnum, s in enumerate(sep) if s in txt[pos0:]]
finds = getfinds() while finds: pos, snum, sz = max(finds) pos, snum = -pos, -snum ans += [ txt[pos0:pos], [snum, pos] ] pos0 = pos+sz finds = getfinds() if txt[pos0:]: ans += [ txt[pos0:] ] return ans
>>> ms() ['a', [1, 1], , [0, 3], 'b', [2, 6], , [1, 7], 'c'] >>> ms(txt="a!===b=!=c", sep=["=", "!=", "=="]) ['a', [1, 1], , [0, 3], , [0, 4], 'b', [0, 6], , [1, 7], 'c']</lang> Small inaccuracy in the version above: ms("", ["="]) outputs [] instead of [''].
Alternative version <lang python>def min_pos(List): return List.index(min(List))
def find_all(S, Sub, Start = 0, End = -1, IsOverlapped = 0): Res = [] if End == -1: End = len(S) if IsOverlapped: DeltaPos = 1 else: DeltaPos = len(Sub) Pos = Start while 1: Pos = S.find(Sub, Pos, End) if Pos == -1: break Res.append(Pos) Pos += DeltaPos return Res
def multisplit(S, SepList): SepPosListList = [] SLen = len(S) SepNumList = [] ListCount = 0 for i in range(len(SepList)): Sep = SepList[i] SepPosList = find_all(S, Sep, 0, SLen, IsOverlapped = 1) if SepPosList != []: SepNumList.append(i) SepPosListList.append(SepPosList) ListCount += 1 if ListCount == 0: return [S] MinPosList = [] for i in range(ListCount): MinPosList.append(SepPosListList[i][0]) SepEnd = 0 MinPosPos = min_pos(MinPosList) Res = [] while 1: Res.append( S[SepEnd : MinPosList[MinPosPos]] ) Res.append([SepNumList[MinPosPos], MinPosList[MinPosPos]]) SepEnd = MinPosList[MinPosPos] + len(SepList[SepNumList[MinPosPos]]) while 1: MinPosPos = min_pos(MinPosList) if MinPosList[MinPosPos] < SepEnd: del(SepPosListList[MinPosPos][0]) if len(SepPosListList[MinPosPos]) == 0: del(SepPosListList[MinPosPos]) del(MinPosList[MinPosPos]) del(SepNumList[MinPosPos]) ListCount -= 1 if ListCount == 0: break else: MinPosList[MinPosPos] = SepPosListList[MinPosPos][0] else: break if ListCount == 0: break Res.append(S[SepEnd:]) return Res
S = "a!===b=!=c"
multisplit(S, ["==", "!=", "="]) # output: ['a', [1, 1], , [0, 3], 'b', [2, 6], , [1, 7], 'c']
multisplit(S, ["=", "!=", "=="]) # output: ['a', [1, 1], , [0, 3], , [0, 4], 'b', [0, 6], , [1, 7], 'c']
</lang>