Strip control codes and extended characters from a string
You are encouraged to solve this task according to the task description, using any language you may know.
The task is to strip control codes and extended characters from a string. The solution should demonstrate how to achieve each of the following results:
- a string with control codes stripped (but extended characters not stripped)
- a string with control codes and extended characters stripped
In ASCII, the control codes have decimal codes 0 through to 31 and 127; extended characters have codes greater than 127. On an ASCII based system, if the control codes and extended characters are stripped, the resultant string would have all of its characters within the range of 32 to 126 decimal on the ASCII table.
On a non-ASCII based system, we consider characters that do not have a corresponding glyph on the ASCII table (within the ASCII range of 32 to 126 decimal) to be an extended character for the purpose of this task.
[edit] Ada
with Ada.Text_IO;
procedure Strip_ASCII is

   -- Sample input made of
   --   5 ordinary characters ('a' .. 'e')
   --   2 control characters (11, 127); 11 is the "vertical tab"
   --   3 extended characters (166, 203, 202)
   Full: String := 'a' & Character'Val(11) & 'b' & Character'Val(166) &
     'c' & Character'Val(127) & Character'Val(203) &
     Character'Val(202) & "de";

   -- Returns the characters of S that lie in From .. To or are greater
   -- than Above, in their original order.
   function Filter(S: String;
                   From: Character := ' ';
                   To: Character := Character'Val(126);
                   Above: Character := Character'Val(127)) return String is
      Kept  : String(1 .. S'Length);  -- large enough even if nothing is dropped
      Count : Natural := 0;           -- number of characters stored in Kept
   begin
      for I in S'Range loop
         if S(I) in From .. To or else S(I) > Above then
            Count := Count + 1;
            Kept(Count) := S(I);
         end if;
      end loop;
      return Kept(1 .. Count);
   end Filter;

   -- Prints Text, then S in double quotes, then the length of S.
   procedure Put_Line(Text, S: String) is
      Size : constant String := Integer'Image(S'Length);
   begin
      Ada.Text_IO.Put_Line(Text & " """ & S & """, Length:" & Size);
   end Put_Line;

begin
   Put_Line("The full string :", Full);
   -- all defaults: keeps ' ' .. Character'Val(126) and everything above 127
   Put_Line("No Control Chars:", Filter(Full));
   -- nothing is greater than Character'Last, so only From .. To survives
   Put_Line("Neither_Extended:", Filter(Full, Above => Character'Last));
end Strip_ASCII;
Output:
The full string : "a
b�c��de", Length: 10
No Control Chars: "ab�c��de", Length: 8
Neither_Extended: "abcde", Length: 5
[edit] AutoHotkey
; Returns x with every character outside printable ASCII (32..126) removed,
; i.e. without control codes (0..31 and DEL, 127) and without extended
; characters (128 and above).
Stripped(x){
	r := ""
	Loop Parse, x
	{
		code := Asc(A_LoopField)
		; DEL (127) is a control code, so the upper bound has to exclude it
		if (code > 31 and code < 127)
			r .= A_LoopField
	}
	return r
}
MsgBox % stripped("`ba" Chr(00) "b`n`rc`fd" Chr(0xc3))
[edit] BASIC
While DOS does support some extended characters, they aren't entirely standardized, and shouldn't be relied upon.
DECLARE FUNCTION strip$ (what AS STRING)
DECLARE FUNCTION strip2$ (what AS STRING)
DIM x AS STRING, y AS STRING, z AS STRING
' tab c+cedilla eof
x = CHR$(9) + "Fran" + CHR$(135) + "ais" + CHR$(26)
y = strip(x)
z = strip2(x)
PRINT "x:"; x
PRINT "y:"; y
PRINT "z:"; z
' Returns what with every character outside codes 32-126 removed.
FUNCTION strip$ (what AS STRING)
    DIM kept AS STRING, idx AS INTEGER, code AS INTEGER
    FOR idx = 1 TO LEN(what)
        code = ASC(MID$(what, idx, 1))
        IF code >= 32 AND code <= 126 THEN kept = kept + CHR$(code)
    NEXT
    strip$ = kept
END FUNCTION
' Returns what reduced to codes 32-126 plus the selected high codes
' 128-168, 171-175 and 224-253.
FUNCTION strip2$ (what AS STRING)
    DIM kept AS STRING, idx AS INTEGER, code AS INTEGER, ok AS INTEGER
    FOR idx = 1 TO LEN(what)
        code = ASC(MID$(what, idx, 1))
        'normal accented various greek, math, etc.
        ok = (code >= 32 AND code <= 126) OR (code >= 128 AND code <= 168)
        ok = ok OR (code >= 171 AND code <= 175) OR (code >= 224 AND code <= 253)
        IF ok THEN kept = kept + CHR$(code)
    NEXT
    strip2$ = kept
END FUNCTION
Output:
x: Français→ y:Franais z:Français
See also: Liberty BASIC, PureBasic, Run BASIC
[edit] BBC BASIC
test$ = CHR$(9) + "Fran" + CHR$(231) + "ais." + CHR$(127)
PRINT "Original ISO-8859-1 string: " test$ " (length " ; LEN(test$) ")"
test$ = FNstripcontrol(test$)
PRINT "Control characters stripped: " test$ " (length " ; LEN(test$) ")"
test$ = FNstripextended(test$)
PRINT "Control & extended stripped: " test$ " (length " ; LEN(test$) ")"
END
DEF FNstripcontrol(A$) : REM CHR$(127) is a 'control' code
REM Returns A$ without characters below 32 and without CHR$(127).
LOCAL I%
I% = 1
WHILE I% <= LEN(A$)
  IF ASCMID$(A$,I%)<32 OR ASCMID$(A$,I%)=127 THEN
    REM Deleting moves the following character into position I%, so the
    REM index must stay put or adjacent control codes would be missed.
    A$ = LEFT$(A$,I%-1) + MID$(A$,I%+1)
  ELSE
    I% += 1
  ENDIF
ENDWHILE
= A$
DEF FNstripextended(A$)
REM Returns A$ without characters whose code is above 127.
LOCAL I%
I% = 1
WHILE I% <= LEN(A$)
  IF ASCMID$(A$,I%)>127 THEN
    REM Deleting moves the following character into position I%, so the
    REM index must stay put or adjacent extended characters would be missed.
    A$ = LEFT$(A$,I%-1) + MID$(A$,I%+1)
  ELSE
    I% += 1
  ENDIF
ENDWHILE
= A$
Output:
Original ISO-8859-1 string: Français (length 11) Control characters stripped: Français. (length 9) Control & extended stripped: Franais. (length 8)
[edit] C
#include <stdio.h>
#include <stdlib.h>
#define IS_CTRL (1 << 0)
#define IS_EXT (1 << 1)
#define IS_ALPHA (1 << 2)
#define IS_DIGIT (1 << 3) /* not used, just give you an idea */
unsigned int char_tbl[256] = {0};
/* could use ctypes, but then they pretty much do the same thing */
/* Fill char_tbl with the class flags of every byte value: IS_CTRL for
 * 0..31 and 127, IS_ALPHA for 'A'..'Z' and the codes 0x20 above them,
 * IS_EXT for 128..255. */
void init_table()
{
	int c;

	for (c = 0; c < 256; c++) {
		if (c < 32 || c == 127)
			char_tbl[c] |= IS_CTRL;
		if ((c >= 'A' && c <= 'Z') ||
		    (c >= 'A' + 0x20 && c <= 'Z' + 0x20))	/* lower case */
			char_tbl[c] |= IS_ALPHA;
		if (c >= 128)
			char_tbl[c] |= IS_EXT;
	}
}
/* In-place filter. "what" is an OR of IS_* flags; every byte whose
 * char_tbl entry has one of those bits set is dropped, the remaining
 * bytes are packed towards the front and the string is re-terminated.
 */
void strip(char * str, int what)
{
	unsigned char *src = (void*)str;
	unsigned char *dst = src;

	for (; *src != '\0'; src++) {
		if (char_tbl[(int)*src] & what)
			continue;	/* flagged: leave it out */
		*dst++ = *src;
	}
	*dst = '\0';
}
/* Builds a string holding each byte value 1..255 once and prints it after
 * stripping control codes, then control + extended codes, then control +
 * extended + alphabetic characters. */
int main()
{
	char a[256];
	int i;

	init_table();

	/* populate string with one of each char (1..255). The bound has to be
	 * 256: stopping at 255 left a[254] unwritten between the data and the
	 * terminator, so strip() read an indeterminate byte. */
	for (i = 1; i < 256; i++) a[i - 1] = i;
	a[255] = '\0';
	strip(a, IS_CTRL);
	printf("%s\n", a);

	/* strip() works in place, so the buffer is refilled before each pass */
	for (i = 1; i < 256; i++) a[i - 1] = i;
	a[255] = '\0';
	strip(a, IS_CTRL | IS_EXT);
	printf("%s\n", a);

	for (i = 1; i < 256; i++) a[i - 1] = i;
	a[255] = '\0';
	strip(a, IS_CTRL | IS_EXT | IS_ALPHA);
	printf("%s\n", a);

	return 0;
}
output:
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ <odd stuff my xterm thinks are bad unicode hence can't be properly shown>
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~
[edit] C++
#include <string>
#include <iostream>
#include <algorithm>
#include <boost/lambda/lambda.hpp>
#include <boost/lambda/casts.hpp>
#include <ctime>
#include <cstdlib>
using namespace boost::lambda ;
struct MyRandomizer {
char operator( )( ) {
return static_cast<char>( rand( ) % 256 ) ;
}
} ;
// Returns a copy of startstring without ASCII control codes (0..31 and
// DEL, 127). Extended characters (128..255) are kept.
std::string deleteControls ( std::string startstring ) {
   std::string noControls ;
   noControls.reserve( startstring.size( ) ) ;
   for ( std::string::size_type i = 0 ; i < startstring.size( ) ; i++ ) {
      // plain char may be signed: go through unsigned char so extended
      // characters compare as 128..255 instead of as negative numbers
      const int code = static_cast<unsigned char>( startstring[ i ] ) ;
      // a control code is below 32 OR equal to 127; no value is both
      if ( code >= 32 && code != 127 )
         noControls.push_back( startstring[ i ] ) ;
   }
   return noControls ;
}
// Returns a copy of startstring reduced to printable ASCII (32..126):
// control codes, including DEL, and extended characters are removed.
std::string deleteExtended( std::string startstring ) {
   std::string noExtended ;
   noExtended.reserve( startstring.size( ) ) ;
   for ( std::string::size_type i = 0 ; i < startstring.size( ) ; i++ ) {
      // plain char may be signed: go through unsigned char so extended
      // characters compare as 128..255 instead of as negative numbers
      const int code = static_cast<unsigned char>( startstring[ i ] ) ;
      if ( code >= 32 && code < 127 )
         noExtended.push_back( startstring[ i ] ) ;
   }
   return noExtended ;
}
int main( ) {
std::string my_extended_string ;
for ( int i = 0 ; i < 40 ; i++ ) //we want the extended string to be 40 characters long
my_extended_string.append( " " ) ;
srand( time( 0 ) ) ;
std::generate_n( my_extended_string.begin( ) , 40 , MyRandomizer( ) ) ;
std::string no_controls( deleteControls( my_extended_string ) ) ;
std::string no_extended ( deleteExtended( my_extended_string ) ) ;
std::cout << "string with all characters: " << my_extended_string << std::endl ;
std::cout << "string without control characters: " << no_controls << std::endl ;
std::cout << "string without extended characters: " << no_extended << std::endl ;
return 0 ;
}
Output:
string with all characters: K�O:~���7�5���� ���W��@>��ȓ�q�Q@���W- string without control characters: K�O:~���7�5���� ���W��@>��ȓ�q�Q@���W- string without extended characters: KO:~75W@>qQ@W-
[edit] D
import std.traits;
S stripChars(S)(S s, bool function(dchar) pure nothrow mustStrip)
pure nothrow if (isSomeString!S) {
S result;
foreach (c; s) {
if (!mustStrip(c))
result ~= c;
}
return result;
}
void main() {
import std.stdio, std.uni;
auto s = "\u0000\u000A abc\u00E9def\u007F";
writeln(s.stripChars( &isControl ));
writeln(s.stripChars( c => isControl(c) || c == '\u007F' ));
writeln(s.stripChars( c => isControl(c) || c >= '\u007F' ));
}
- Output:
abcédef abcédef abcdef
[edit] Forth
: strip ( buf len -- buf len' ) \ repacks buffer, so len' <= len
over + over swap over ( buf dst limit src )
do
i c@ 32 127 within if
i c@ over c! char+
then
loop
over - ;
[edit] Fortran
! Character filters and a generic strip() that applies one of them.
module stripcharacters
implicit none

contains

  ! True when ch is not an ASCII control code (0..31 or 127).
  pure logical function not_control(ch)
    character, intent(in) :: ch
    not_control = iachar(ch) >= 32 .and. iachar(ch) /= 127
  end function not_control

  ! True when ch has a code in 32..126, i.e. it is neither a control
  ! code nor an extended character.
  pure logical function not_extended(ch)
    character, intent(in) :: ch
    not_extended = iachar(ch) >= 32 .and. iachar(ch) < 127
  end function not_extended

  ! Returns the characters of string for which accept() is true, packed to
  ! the left. The result has the length of string, padded with blanks.
  pure function strip(string,accept) result(str)
    character(len=*), intent(in) :: string
    character(len=len(string)) :: str
    interface
      pure logical function accept(ch)
        character, intent(in) :: ch
      end function accept   ! the name here has to match the one being closed
    end interface
    integer :: i,n
    str = repeat(' ',len(string))
    n = 0
    do i=1,len(string)
      if ( accept(string(i:i)) ) then
        n = n+1
        str(n:n) = string(i:i)
      end if
    end do
  end function strip

end module stripcharacters
! Builds a string holding character codes 0..255 and prints it raw, with
! control characters removed, and with extended characters removed too.
program test
  use stripcharacters
  character(len=256) :: string, str
  integer :: ascii(256), i
  ! ascii is indexed 1..256, so slot i holds code i-1; indexing it from 0
  ! wrote outside the array and left ascii(256) undefined
  forall (i=1:256) ascii(i) = i-1
  forall (i=1:len(string)) string(i:i) = achar(ascii(i))
  write (*,*) string

  write (*,*) 'Control characters deleted:'
  str = strip(string,not_control)
  write (*,*) str

  forall (i=1:len(string)) string(i:i) = achar(ascii(i))
  write (*,*) 'Extended characters deleted:'
  write (*,*) strip(string,not_extended)
end program test
[edit] Go
Go works for ASCII and non-ASCII systems. The first pair of functions below interpret strings as byte strings, presumably useful for strings consisting of ASCII and 8-bit extended ASCII data. The second pair of functions interpret strings as UTF-8.
package main
import (
"fmt"
"strings"
)
// two byte-oriented functions identical except for operator comparing c to 127.
// stripCtlFromBytes treats str as raw bytes and returns it without the
// bytes below 32 and without byte 127.
func stripCtlFromBytes(str string) string {
	kept := make([]byte, 0, len(str))
	for _, c := range []byte(str) {
		if c < 32 || c == 127 {
			continue
		}
		kept = append(kept, c)
	}
	return string(kept)
}
// stripCtlAndExtFromBytes treats str as raw bytes and returns only the
// bytes in the range 32..126.
func stripCtlAndExtFromBytes(str string) string {
	kept := make([]byte, 0, len(str))
	for _, c := range []byte(str) {
		if c < 32 || c >= 127 {
			continue
		}
		kept = append(kept, c)
	}
	return string(kept)
}
// two UTF-8 functions identical except for operator comparing c to 127
// stripCtlFromUTF8 walks str rune by rune and drops the runes below 32
// and rune 127; every other rune is kept.
func stripCtlFromUTF8(str string) string {
	keep := func(r rune) rune {
		switch {
		case r < 32, r == 127:
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}
	return strings.Map(keep, str)
}
// stripCtlAndExtFromUTF8 walks str rune by rune and keeps only the runes
// in the range 32..126.
func stripCtlAndExtFromUTF8(str string) string {
	keep := func(r rune) rune {
		switch {
		case r < 32, r >= 127:
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}
	return strings.Map(keep, str)
}
const src = "déjà vu" + // precomposed unicode
"\n\000\037 \041\176\177\200\377\n" + // various boundary cases
"as⃝df̅" // unicode combining characters
func main() {
fmt.Println("source text:")
fmt.Println(src, "\n")
fmt.Println("as bytes, stripped of control codes:")
fmt.Println(stripCtlFromBytes(src), "\n")
fmt.Println("as bytes, stripped of control codes and extended characters:")
fmt.Println(stripCtlAndExtFromBytes(src), "\n")
fmt.Println("as UTF-8, stripped of control codes:")
fmt.Println(stripCtlFromUTF8(src), "\n")
fmt.Println("as UTF-8, stripped of control codes and extended characters:")
fmt.Println(stripCtlAndExtFromUTF8(src))
}
Output: (varies with display configuration)
source text: déjà vu � !~�� as⃝df̅ as bytes, stripped of control codes: déjà vu !~��as⃝df̅ as bytes, stripped of control codes and extended characters: dj vu !~asdf as UTF-8, stripped of control codes: déjà vu !~��as⃝df̅ as UTF-8, stripped of control codes and extended characters: dj vu !~asdf
[edit] Haskell
import Data.Char
-- | Keeps only the printable ASCII characters, codes 32 (space) to 126 ('~')
-- inclusive; control codes and extended characters are removed.
-- The bounds are inclusive: strict comparisons would also drop the space
-- and the tilde.
strip :: String -> String
strip = filter (\x -> ord x >= 32 && ord x <= 126)
[edit] Icon and Unicon
We'll use deletec to remove unwanted characters (2nd argument) from a string (1st argument). The procedure below coerces types back and forth between string and cset. The character set of unwanted characters is the difference of all ASCII characters and the ASCII characters from 33 to 126.
procedure main(A)
   # Characters to keep: codes 32..126. As a string, &ascii holds code n at
   # index n+1, and a slice [i:j] takes indices i .. j-1, so 33:128 covers
   # indices 33..127. An end position of 127 stopped at code 125 and caused
   # "~" (code 126) to be deleted as well.
   keep := (&ascii)[33:128]
   write(image(deletec(&ascii, &ascii -- keep)))
end
link strings
The IPL procedure deletec is equivalent to this:
procedure deletec(s, c) #: delete characters
result := ""
s ? {
while result ||:= tab(upto(c)) do tab(many(c))
return result ||:= tab(0)
}
end
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}"
[edit] J
Solution:
stripControlCodes=: -.&(DEL,32{.a.)
stripControlExtCodes=: ([ -. -.)&(32}.127{.a.)
Usage:
mystring=: a. {~ ?~256 NB. ascii chars 0-255 in random order
#mystring NB. length of string
256
#stripControlCodes mystring NB. length of string without control codes
223
#stripControlExtCodes mystring NB. length of string without control codes or extended chars
95
#myunicodestring=: u: ?~1000 NB. unicode characters 0-999 in random order
1000
#stripControlCodes myunicodestring
967
#stripControlExtCodes myunicodestring
95
stripControlExtCodes myunicodestring
k}w:]U3xEh9"GZdr/#^B.Sn%\uFOo[(`t2-J6*IA=Vf&N;lQ8,${XLz5?D0~s)'Y7Kq|ip4<WRCaM!b@cgv_T +mH>1ejPy
[edit] Liberty BASIC
all$ =""
for i =0 to 255
all$ =all$ +chr$( i)
next i
print "Original string of bytes. ( chr$( 10) causes a CRLF.)"
print all$
lessControl$ =controlStripped$( all$)
print "With control codes stripped out."
print lessControl$
lessExtendedAndControl$ =extendedStripped$( lessControl$)
print "With extended codes stripped out too."
print lessExtendedAndControl$
end
' Returns i$ without ASCII control codes: 0..31 and DEL (127).
function controlStripped$( i$)
    r$ =""
    for j =1 to len( i$)
        ch$ =mid$( i$, j, 1)
        ' DEL (127) is a control code too, so ">= 32" alone is not enough
        if asc( ch$) >=32 and asc( ch$) <>127 then r$ =r$ +ch$
    next j
    controlStripped$ =r$
end function
' Returns i$ without extended characters, i.e. without codes 128..255.
function extendedStripped$( i$)
    r$ =""
    for j =1 to len( i$)
        ch$ =mid$( i$, j, 1)
        ' 128 is the first extended code, so the comparison must be strict
        if asc( ch$) <128 then r$ =r$ +ch$
    next j
    extendedStripped$ =r$
end function
[edit] Lua
-- Returns str with every character of the pattern class %c (control
-- characters) removed.
function Strip_Control_Codes( str )
    -- the parentheses discard gsub's second result, the replacement count
    return ( str:gsub( "%c", "" ) )
end
-- Returns str reduced to the bytes 32..126 (space through tilde).
function Strip_Control_and_Extended_Codes( str )
    local kept = {}
    for pos = 1, #str do
        local code = str:byte( pos )
        if code >= 32 and code <= 126 then
            kept[#kept + 1] = string.char( code )
        end
    end
    return table.concat( kept )
end
q = ""
for i = 0, 255 do
q = q .. string.char(i)
end
print( Strip_Control_Codes(q) )
print( Strip_Control_and_Extended_Codes(q) )
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
[edit] Mathematica
(* Keeps printable ASCII, space (32) through "~" (126), plus the extended
   characters 128..255; control codes 0..31 and DEL (127) are removed.
   The range starts at " ", not "!", because space is not a control code. *)
stripCtrl[x_]:=StringJoin[Select[Characters[x],
MemberQ[CharacterRange[" ","~"]~Join~Characters[FromCharacterCode[Range[128,255]]],#]&]]
(* Keeps only printable ASCII, space (32) through "~" (126).
   The range starts at " ", not "!", because space is not a control code. *)
stripCtrlExt[x_]:=StringJoin[Select[Characters[x],
MemberQ[CharacterRange[" ","~"],#]&]]
Test:
CompleteSet=FromCharacterCode[Range[0,255]]
->\.00\.02\.03\.04\.05\.06\.07\.08\.0b\.0e\.0f\.10\.11\.12\.13\.14
\.15\.16\.17\.18\.19\.1a\[RawEscape]\.1c\.1d\.1e\.1f !"#$%&'()*+,-./
0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
^_`abcdefghijklmnopqrstuvwxyz{|}~
¡¢£¤¥¦§¨©ª«\[Not]®¯\[Degree]
\[PlusMinus]\.b2\.b3\.b4\[Micro]\[Paragraph]\[CenterDot]¸¹º»¼½¾¿
ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ*ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö/øùúûüýþÿ
stripCtrl[CompleteSet]
->!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
^_`abcdefghijklmnopqrstuvwxyz{|}~
¡¢£¤¥¦§¨©ª«\[Not]®¯\[Degree]
\[PlusMinus]\.b2\.b3\.b4\[Micro]\[Paragraph]\[CenterDot]
¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ*ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö
/øùúûüýþÿ
stripCtrlExt[CompleteSet]
->!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
^_`abcdefghijklmnopqrstuvwxyz{|}~
[edit] MATLAB / Octave
function str = stripped(str)
  % Keep only the elements of str whose value lies strictly between 31
  % and 127.
  keep = (str > 31) & (str < 127);
  str = str(keep);
end;
[edit] OCaml
(* True for the codes 0..31 and for 127. *)
let is_control_code c =
  match int_of_char c with
  | 127 -> true
  | d -> d < 32
(* True for every code above 127. *)
let is_extended_char c =
  int_of_char c > 127
(* [strip f str] returns [str] without the characters for which [f] is true.
   The result is assembled in a Buffer: String.create and in-place string
   assignment are not accepted by current compilers, where strings are
   immutable. *)
let strip f str =
  let buf = Buffer.create (String.length str) in
  String.iter (fun c -> if not (f c) then Buffer.add_char buf c) str;
  Buffer.contents buf
(* Prints a string of 32 random bytes twice: once without control codes and
   once without control codes and extended characters. *)
let () =
  let len = 32 in
  Random.self_init();
  (* String.init builds the random string directly, so no mutable string
     (String.create plus in-place assignment) is needed *)
  let s = String.init len (fun _ -> char_of_int (Random.int 256)) in
  print_endline (strip is_control_code s);
  print_endline (strip (fun c -> (is_control_code c) || (is_extended_char c)) s);
;;
[edit] Pascal
program StripCharacters(output);
{ Returns s with the selected character classes removed:
    control  - ASCII control codes, i.e. 0..31 and DEL (127)
    extended - codes above 127 }
function Strip (s: string; control, extended: boolean): string;
var
  index, code: integer;
begin
  Strip := '';
  for index := 1 to length(s) do
  begin
    code := ord(s[index]);
    { space (32) is printable and has to survive; DEL (127) is a control code }
    if not ((control and ((code < 32) or (code = 127))) or (extended and (code > 127))) then
      Strip := Strip + s[index];
  end;
end;
var
test: string;
i: integer;
begin
setlength(test, 40);
randomize;
for i := 1 to length(test) do
test[i] := char(1 + random(255));
writeln ('Original: ', test);
writeln ('No CNTL: ', Strip(test, true, false));
writeln ('No extnd: ', Strip(test, false, true));
writeln ('ASCII: ', Strip(test, true, true));
end.
Output:
% ./StripCharacters
Original: )?z8i9?a?K??N?s?F˪w?a??s
#?b?B}PT?ٜ
No CNTL: )?z8i9?a?K??N?s?F˪w?a??s#?b?B}PT?ٜ
No extnd: )z8i9aKNsFwas
#bB}PT
ASCII: )z8i9aKNsFwas#bB}PT
[edit] Perl
#!/usr/bin/perl -w
# Generates 40 random character codes (0..255) and prints them as a string
# three times: unchanged, without control codes, and without control codes
# and extended characters.
use strict ;
my @letters ;
my @nocontrols ;
my @noextended ;
for ( 1..40 ) {
   push @letters , int( rand( 256 ) ) ;
}
print "before sanitation : " ;
print join( '' , map { chr( $_ ) } @letters ) ;
print "\n" ;
# control codes are 0..31 and 127; the space (32) is printable, so the
# lower bound has to be inclusive
@nocontrols = grep { $_ >= 32 && $_ != 127 } @letters ;
print "Without controls: " ;
print join( '' , map { chr( $_ ) } @nocontrols ) ;
# 127 is already gone at this point, so this removes the codes 128..255
@noextended = grep { $_ < 127 } @nocontrols ;
print "\nWithout extended: " ;
print join( '' , map { chr( $_ ) } @noextended ) ;
print "\n" ;
Output:
before sanitation : �L08&YH�O��n)�:���O�G$���.���"zO���Q�?�� Without controls: �L08&YH�O��n)�:�O�G$���.���"zO��Q�?�� Without extended: L08&YHOn):OG$."zOQ?
[edit] Perl 6
my $str = (0..400).roll(80)».chr.join;
say $str;
say $str.subst(/<[ ^@..^_ ]>/, '', :g);
say $str.subst(/<-[ \ ..~ ]>/, '', :g);
�¶ØèúđkƌĘ�r=êıƏÄÙÍy1SGa%TÑ�ęMRŅ�EŧİÌŬńĩµ9ŒďĔÜÉĈĬzijdś5FúŨƏźƅíýÛÃņGÏ
ö~ƀRÑú
¶ØèúđkƌĘr=êıƏÄÙÍy1SGa%TÑęMRŅEŧİÌŬńĩµ9ŒďĔÜÉĈĬzijdś5FúŨƏźƅíýÛÃņGÏö~ƀRÑú
kr=y1SGa%TMRE9zd5FG~R
[edit] PicoLisp
Control characters in strings are written with a hat (^) in PicoLisp. ^? is the DEL character.
(de stripCtrl (Str)
(pack
(filter
'((C)
(nor (= "^?" C) (> " " C "^A")) )
(chop Str) ) ) )
(de stripCtrlExt (Str)
(pack
(filter
'((C) (> "^?" C "^_"))
(chop Str) ) ) )
Test:
: (char "^?") -> 127 : (char "^_") -> 31 : (stripCtrl "^I^M a b c^? d äöüß") -> " a b c d äöüß" : (stripCtrlExt "^I^M a b c^? d äöüß") -> " a b c d "
[edit] Pike
> string input = random_string(100);
> (string)((array)input-enumerate(32)-enumerate(255-126,1,127));
Result: "p_xx08M]cK<FHgR3\\I.x>)Tm<VgakYddy&P7"
[edit] PL/I
stripper: proc options (main);
declare s character (100) varying;
declare i fixed binary;
s = 'the quick brown fox jumped';
/* A loop to replace blanks with control characters */
do i = 1 to length(s);
if substr(s, i, 1) = ' ' then
substr(s, i, 1) = '01'x;
end;
put skip list (s);
call stripcc (s);
put skip list (s);
s = 'now is the time for all good men';
/* A loop to replace blanks with control characters */
do i = 1 to length(s);
if substr(s, i, 1) = ' ' then
substr(s, i, 1) = 'A1'x;
end;
put skip list (s);
call stripex (s);
put skip list (s);
/* Strip control codes: characters below blank ('20'x) and DEL ('7F'x). */
stripcc: procedure (s);
   declare s character (*) varying;
   declare w character (length(s));
   declare c character (1);
   declare (i, j) fixed binary;

   j = 0;
   do i = 1 to length (s);
      c = substr(s, i, 1);
      /* keep c only when it is at or above blank AND is not DEL; joining */
      /* the two tests with | kept DEL, which already passes the first.   */
      if unspec(c) >= '00100000'b & unspec(c) ^= '01111111'b then
         do;
            j = j + 1;
            substr(w, j, 1) = c;
         end;
   end;
   s = substr(w, 1, j);
end stripcc;
/* Strips control codes and extended characters. */
stripex: procedure (s);
declare s character (*) varying;
declare w character (length(s));
declare c character (1);
declare (i, j) fixed binary;
j = 0;
do i = 1 to length (s);
c = substr(s, i, 1);
if unspec(c) >= '00100000'b & unspec(c) < '01111111'b then
do;
j = j + 1;
substr(w, j, 1) = c;
end;
end;
s = substr(w, 1, j);
end stripex;
end stripper;
Output:
the�quick�brown�fox�jumped thequickbrownfoxjumped now¡is¡the¡time¡for¡all¡good¡men nowisthetimeforallgoodmen
[edit] Protium
Protium has a native instruction for removing control codes from a string, SAL, the Low ASCII Strip. From the manual:
Create variable with control characters: <@ SAYLETVARLIT>i|This string has control characters
- - - - - -
in it</@>
Strip control characters <@ SAYSALVAR>i</@>
Assign infix <@ LETVARSALVAR>j|i</@> <@ SAYVAR>j</@>
Assign prepend <@ LETSALVARVAR>k|i</@> <@ SAYVAR>k</@>
Reflexive assign <@ ACTSALVAR>i</@> <@ SAYVAR>i</@>
Protium also has SAH, High ASCII Strip. Again, from the manual:
Create variable with high and low ANSI: <@ SAYLETVARLIT>i|This string has both low ansi and high ansi characters - il doit d'être prévenu</@>
Strip high ANSI <@ SAYSAHVAR>i</@>
Assign infix <@ LETVARSAHVAR>j|i</@> <@ SAYVAR>j</@>
Assign prepend <@ LETSAHVARVAR>k|i</@> <@ SAYVAR>k</@>
Reflexive assign <@ ACTSAHVAR>i</@> <@ SAYVAR>i</@>
[edit] PureBasic
; Returns source without ASCII control codes (0..31 and DEL, 127).
Procedure.s stripControlCodes(source.s)
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    ; DEL (127) is a control code as well, so it is excluded explicitly
    If *ptrChar\c > 31 And *ptrChar\c <> 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result
EndProcedure
; Returns source reduced to printable ASCII (32..126): control codes,
; DEL included, and extended characters are removed.
Procedure.s stripControlExtCodes(source.s)
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    ; the upper bound is 127, not 128, so that DEL is dropped too
    If *ptrChar\c > 31 And *ptrChar\c < 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result
EndProcedure
If OpenConsole()
;create sample string
Define i, s.s
For i = 1 To 80
s + Chr(Random(254) + 1) ;include character values from 1 to 255
Next
PrintN(stripControlCodes(s)) ;string without control codes
PrintN("---------")
PrintN(stripControlExtCodes(s)) ;string without control codes or extended chars
Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input()
CloseConsole()
EndIf
Sample output:
»╫=┐C─≡G(═ç╤â√╝÷╔¬ÿ▌x è4∞|)ï└⌐ƒ9²òτ┌ºáj)▓<~-vPÿφQ╨ù¿╖îFh"[ü╗dÉ₧q#óé├p╫■ --------- =CG(x 4|)9j)<~-vPQFh"[dq#p
[edit] Python
def stripped(x):
    """Return x without control codes and extended characters.

    Only the characters whose code is in the range 32..126 are kept.
    """
    return "".join([i for i in x if 32 <= ord(i) < 127])


# Single-argument call form, valid both as a statement and as a function call.
print(stripped("\ba\x00b\n\rc\fd\xc3"))
Output:
abcd
[edit] Racket
#lang racket
;; Works on both strings (Unicode) and byte strings (raw/ASCII)
;; Removes every run of characters in the octal ranges \0-\037 and \177
;; (decimal 0..31 and 127) by replacing it with the empty string.
(define (strip-controls str)
(regexp-replace* #rx"[\0-\037\177]+" str ""))
;; Removes every run of characters outside the octal range \040-\176
;; (decimal 32..126) by replacing it with the empty string.
(define (strip-controls-and-extended str)
(regexp-replace* #rx"[^\040-\176]+" str ""))
[edit] REXX
[edit] version 1
(Both versions support ASCII and EBCDIC.)
This version is much faster, but it's much harder to understand what's happening.
/*REXX program to strip all "control codes" from a string (ASCII|EBCDIC)*/
xxx='string of ☺☻♥♦⌂, may include control characters and other ilk.♫☼§►↔◄'
/*in EBCDIC, digit 1 is 'f1'x,*/
/*in ASCII, digit 1 is '31'x.*/
ebcdic= 1=='f1'x /*is this an EBCDIC computer?*/
/*generate a string of chars from*/
/*'00'x ──► [1 just before blank]*/
ccChars = xrange(,d2c(c2d(' ') -1)) /*generate a range of characters.*/
if \ebcdic then ccChars=ccChars'7f'x /*add the ASCII '7f'X char. */
say 'hex ccChars =' c2x(ccChars) /*might as well do a show & tell.*/
ccCharsX = ccChars'ff'x /*add a "stop" char for ccCharsX.*/
/*══════════════════════════════════════════════════════════════════════*/
_stop = substr(ccCharsX, verify(ccCharsX, xxx), 1) /*find a stop char.*/
yyy = translate(space(translate(xxx, _stop, " "ccChars), 0), , _stop)
/*══════════════════════════════════════════════════════════════════════*/
say 'old = >>>'xxx"<<<" /*add fence before&after old text*/
say 'new = >>>'yyy"<<<" /* " " " " new text*/
/*stick a fork in it, we're done.*/
output
hex ccChars = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F7F old = >>>string of ☺☻♥♦⌂, may include control characters and other ilk.♫☼§►↔◄<<< new = >>>string of , may include control characters and other ilk.<<<
[edit] version 2
(Both versions support ASCII and EBCDIC.)
A slower version, but it's much easier to understand the process.
/*REXX program to strip all "control codes" from a string (ASCII|EBCDIC)*/
xxx='string of ☺☻♥♦⌂, may include control characters and other ilk.♫☼§►↔◄'
/*in EBCDIC, digit 1 is 'f1'x,*/
/*in ASCII, digit 1 is '31'x.*/
ascii= '31'x==1 /*is this an ASCII computer? */
/* (if you are ASCII-centric.) */
/*generate a string of chars from*/
/*'00'x --> [1 just before blank]*/
ccChars=xrange(, d2c(c2d(' ') -1)) /*generate a range of characters.*/
if ascii then ccChars = ccChars'7f'x /*add the ASCII '7f'X char. */
say 'hex ccChars =' c2x(ccChars) /*might as well do a show & tell.*/
/*══════════════════════════════════════════════════════════════════════*/
yyy='' /*start with a clean slate. */
do j=1 for length(xxx) /*build new str, 1 byte at a time*/
_ = substr(xxx,j,1) /*get next char in the old string*/
if pos(_,ccChars)\==0 then iterate /*skip this char, it's a no-no. */
yyy = yyy || _ /*we found a good & decent fellow*/
end
/*══════════════════════════════════════════════════════════════════════*/
say 'old = >>>'xxx"<<<" /*add fence before&after old text*/
say 'new = >>>'yyy"<<<" /* " " " " new text*/
/*stick a fork in it, we're done.*/
output is identical to version 1.
[edit] Ruby
class String
  # Returns a new string without the ASCII characters whose code is below
  # 32 or equal to 127. Characters that are not ASCII-only are kept as is.
  def strip_control_characters()
    each_char.with_object("") do |ch, kept|
      # ord is only evaluated for ASCII-only characters
      is_control = ch.ascii_only? && (ch.ord < 32 || ch.ord == 127)
      kept << ch unless is_control
    end
  end

  # Returns a new string holding only the ASCII characters with a code
  # between 32 and 126.
  def strip_control_and_extended_characters()
    each_char.with_object("") do |ch, kept|
      kept << ch if ch.ascii_only? && ch.ord.between?(32,126)
    end
  end
end
p s = "\ba\x00b\n\rc\fd\xc3\x7ffoo"
p s.strip_control_characters
p s.strip_control_and_extended_characters
outputs
"\ba\u0000b\n\rc\fd\xC3\u007Ffoo" "abcd\xC3foo" "abcdfoo"
[edit] Run BASIC
s$ = chr$(31) + "abc" + chr$(13) + "def" + chr$(11) + "ghi" + chr$(10)
print strip$(s$)
' -----------------------------------------
' strip junk
' -----------------------------------------
FUNCTION strip$(str$)
for i = 1 to len(str$)
a$ = MID$(str$,i,1)
a = ASC(a$)
if a > 31 then
if a < 123 then
if a$ <> "'" then
if a$ <> """" then
strip$ = strip$ + a$
end if
end if
end if
end if
next i
END FUNCTION
input : chr$(31)+"abc"+chr$(13)+"def"+chr$(11)+"ghi"+chr$(10) output : abcdefghi
[edit] Scala
[edit] ASCII: Using StringOps Class
// True for ASCII control codes: 0..31 and DEL (127). Space (32) is a
// printable character, so the lower test is strict.
val controlCode : (Char) => Boolean = (c:Char) => (c < 32 || c == 127)
// True for extended characters only, i.e. codes above 127. Control codes
// are the job of controlCode, and space must not be removed here either.
val extendedCode : (Char) => Boolean = (c:Char) => (c > 127)
// ASCII test...
val teststring = scala.util.Random.shuffle( (1.toChar to 254.toChar).toList ).mkString
println( "ctrl filtered out: \n\n" +
teststring.filterNot(controlCode) + "\n" )
println( "ctrl and extended filtered out: \n\n" +
teststring.filterNot(controlCode).filterNot(extendedCode) + "\n" )
- Output:
ctrl filtered out:
?d2??6ú╖)ⁿ┼gEhW3RS⌠!a?┬╘├╢-ß?·▄╔B,_?╟│┤'C║j«?ΩcqJ╣²▀÷±?0s∩░uτ8Φ½&Σ¬y▓H?*?AL?═eDX??≥╚╧
4σ+r=Ѽ╙U▌"?.⌐?≡K?k╥áF\?╕QΘ?╪Z?▐√╠?`M?7▒°G^≈@xz?>t:╦╨íw¿─┐]Io(V╡?P¡┴?º┌ΓO┘φ└╓~|#⌡π?}µ
╗l???$ó{n/╫mi╤<9f≤?∙»Nª;?1??εT?■╩%╒╛[╜p∞α╬vñ╞bYδ╝5█
ctrl and extended filtered out:
?d26)gEhW3RS!a-B,_'CjcqJ0su8&yH*ALeDX4+r=U".KkF
\QZ`M7G^@xz>t:w]Io(VPO~|#}l${n/mi<9fN;1T%[pvbY5
[edit] Unicode: Using Regular Expressions
//
// A Unicode test string
//
val ulist = 0x8232.toChar :: 0xFFF9.toChar :: 0x200E.toChar :: (1.toChar to 2000.toChar).toList
val ustring = scala.util.Random.shuffle( ulist ).mkString
// Remove control codes including private codes
val sNoCtrlCode = ustring.replaceAll("[\\p{C}]","")
val htmlNoCtrlCode = for( i <- sNoCtrlCode.indices ) yield
"&#" + sNoCtrlCode(i).toInt + ";" + (if( (i+1) % 10 == 0 ) "\n" else "")
println( "ctrl filtered out: <br/><br/>\n\n" + htmlNoCtrlCode.mkString + "<br/><br/>\n" )
// Keep 0x00-0x7f and remove control codes
val sNoExtCode = ustring.replaceAll("[^\\p{InBasicLatin}]","").replaceAll("[\\p{C}]","")
val htmlNoExtCode = for( i <- sNoExtCode.indices ) yield
"&#" + sNoExtCode(i).toInt + ";" + (if( (i+1) % 10 == 0 ) "\n" else "")
println( "ctrl and extended filtered out: <br/><br/>\n\n" + htmlNoExtCode.mkString + "<br/><br/>\n" )
- Output:
ctrl filtered out:
Қ۳ҹؽݠČǗ ɄخȿѻȺ·˖ҎDװѩԸץ˶٤ǞĽۭԑϣʱƁΫیոϐۛؕƟͱӄצ
ݞÄҺϤΪq͆ݶȲֆ֫՞ѡ٘ٺ݅ݦϠ۸ܞЕ²ɋډҒӂȏϘʚΧ˾ܡȳևՋƖדȇހڷ
ӣ؊ՙ̨۟ۀĦ¬ς݇؛ĿΟՕƜ1ԵۣݬͮDŽ̪عڔЊڳœӟѺ՜ʬΚ݀͗̾ʊáLJѪ
ٹȭםޮɅ֜ʼݥޤڎۊŭЁ˹īʶñۼ[ӽه:3ŜSԛģó͑؝Јי̅ݏˬˮіۍ
PԅMѯԶɔܺ֘ݷԾڽˎΞn̏ŗۧ˒ޥϷDžŀ˳Ѡ̮ݕũۃɹƾފѼܚٍڢŽܛńի
߀CžʾʹքӭξҖҢǘZ<ɼÐՅ̔ݧئ٫ʖѬ̩ߌǿɇݻ÷ђΤǀёՓJѦϻΥ¨̦ڥ
ˍ͖٭зݡѧס̶̵¯ȐƌޭԮϙҦ+ڦÇ֍ɽƥЦ۩ǦҠêʧɮð̣̯ޘƭƠͺޠݩγ
ѳ֊НۜÚӺϭ̃ݵƬŇЎٛԪճɟЩЋֱϜޒfޓʨܮɱղŬέǏu։Ҹ˅رɾʘͅ
߃¾ª"ɖȾύƔýՈԉգȨڼۮ͍õŘ]àߊܱЫҴٗԞۢ ˂ہƞɺƚ۞ҤڻƸǵǯ
܄ʠƲťïܠmТϋɥΈώͶ٢ؖټ\Ŵˣé֔ڤÿϛσ߂Λøʞӿăմ؉ϪЗպƓ҆܀ݰ
ѹΏɨғڲƀɞԗ26Ĕŵ³ѥ͛ǨՄǓĻڊʝӹʍ̠ϰʄՍə܅лġޏnj˴ʃ۵Ϻdž
՟Ҽ̈۹ԺߎМރѵ͋ؾЇѐ|ԨةݝɕǕڴljȩþؐ؆j˿ق۽ɳОѷęijٯÂěȴ
˓חګ٠ޅךćءɉӢɗчЉő͜Ѐ֨ےϸܽԀ»эјۇޣ֤Ґ͊ȫއɒۓ۷́ضӖΙَБ
×ʳҧЅȗUƅŷÈɸȕ܆ĵޖȁɢӼԒѢխǖ@ǡȰўҍГӛº۾ף˔ъͣʏ«ћڏ
حǟΖԕƑǼթٸe֦ʗސڍƙКԻ̿ιü̧´փϱʤܭηƷѨЮԭѿ¢ßݚؒ٪ҩʢں`
۠μưɍӉØN̬ވƘȈ֎ǝٙӷΨи¥еοݫ̷Ɨ?ͰĹ֝ݸ_؇Àׁ̌ûʥĭȶǍԔ
үơ̕ג˗ϑӤӰ'άܶѮĀشڱӻǻܼƿ܉ͪѰܑ̓K~Ұ֮πҬʷ˫ۯٿpВ̥
տؓЍܒ˲ӥɰާڜԫ͎ϿűӈϦƤڣʓƎѱޛяâޱƻЄÃ̸ـ̂֕Ӛ̗֣ͬӦ݄ۑƦܖ
߄ɷۉڨȑձІȖƯٵۡуħۦِ̱ԢɑФ١̺دʂsչ.ۋҾėè˭ٷbϮݱݔǩʈ
ȻiͫϯԦĚػţ܍եֿχ݂ت܁լɀгޙȥʼnˈՇݣ̰ӊɩɲȢәNJȬ֙׀ݼŏكЭ
ʒױ̼љфȃHՖ؟҂̢ʕʲƼްӀŁڄ˼ȅԙÙ7ّſۘ٩ȱ֟ƕ͕ʆŚͼօ°؈>ݺ
...
ctrl and extended filtered out:
Dq1[:3SPMnCZ<J+fu"]m\26|jU@e`N?_'K~ps.biH7>zAXVF5
=OgBhYGc-4)E/*a,%wTLoR&W{kd}8l^;0#(!trvIx$Qy9
[edit] Seed7
Seed7 strings are UTF-32 encoded, therefore no distinction between BYTE and Unicode strings is necessary. The example below uses STD_UTF8_OUT from the library utf8.s7i, to write Unicode characters with UTF-8 encoding to the console.
$ include "seed7_05.s7i";
include "utf8.s7i";
const func string: stripControl (in string: stri) is func
result
var string: stripped is "";
local
var integer: old_pos is 1;
var integer: index is 0;
var char: ch is ' ';
begin
for ch key index range stri do
if ch < ' ' or ch = '\127\' then
stripped &:= stri[old_pos .. pred(index)];
old_pos := succ(index);
end if;
end for;
stripped &:= stri[old_pos ..];
end func;
const func string: stripControlAndExtended (in string: stri) is func
result
var string: stripped is "";
local
var integer: old_pos is 1;
var integer: index is 0;
var char: ch is ' ';
begin
for ch key index range stri do
if ch < ' ' or ch >= '\127\' then
stripped &:= stri[old_pos .. pred(index)];
old_pos := succ(index);
end if;
end for;
stripped &:= stri[old_pos ..];
end func;
const string: src is "déjà vu\ # Unicode
\\n\0\\31\ \33\\126\\127\\128\\255\\n\ # Various boundary cases
\as⃝df̅"; # Unicode combining characters
const proc: main is func
begin
OUT := STD_UTF8_OUT;
writeln("source text:");
writeln(src);
writeln("Stripped of control codes:");
writeln(stripControl(src));
writeln("Stripped of control codes and extended characters:");
writeln(stripControlAndExtended(src));
end func;
Output:
source text: déjà vu � !~ÿ as⃝df̅ Stripped of control codes: déjà vu !~ÿas⃝df̅ Stripped of control codes and extended characters: dj vu !~asdf
[edit] Tcl
# Returns str with every run of characters in \u0000-\u001f or equal to
# \u007f replaced by the empty string.
proc stripAsciiCC str {
regsub -all {[\u0000-\u001f\u007f]+} $str ""
}
# Returns str with every run of characters outside \u0020-\u007e replaced
# by the empty string.
proc stripCC str {
regsub -all {[^\u0020-\u007e]+} $str ""
}
[edit] TI-83 BASIC
TI-83 BASIC doesn't support ASCII or Unicode, so the following program just strips every character that doesn't have a corresponding glyph from 32 to 126 decimal in a real ASCII table.
The following "normal characters" do exist, but can't be typed on the calculator and a hex editor must be used to enter them:
#$&@;_`abcdefghijklmnopqrstuvwxyz|~
The double quote character (ASCII decimal 34) can be entered, but cannot be escaped and thus cannot be stored to strings without the use of hex editors. The following program will remove double quotes from the input string if they were hacked in simply because having one stored to the "check" string is syntactically invalid.
So, in sum, you have to hack the calculator to enter in this program, but once it's entered you can transfer it to unhacked calculators and it will work.
:" !#$%&'()*+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"→Str0
:Input ">",Str1
:":"+Str1+":"→Str1
:For(I,2,length(Str1)-2)
:If not(inString(Str0,sub(Str1,I,1)))
:sub(Str1,1,I-1)+sub(Str1,I+1,length(Str1)-(I+1))→Str1
:End
:sub(Str1,2,length(Str1)-1)→Str1
:Pause Str1
[edit] XPL0
include c:\cxpl\codes; \intrinsic 'code' declarations
string 0; \use zero-terminated string convention
proc Strip(Str, Both); \Strip out control and optionally extended chars
char Str; int Both;
int I, J, C;
[I:= 0;
while Str(I) do
[C:= Str(I);
if Both then C:= extend(C); \if stripping extended chars too, extend sign
if C<$20 or C=$7F then
[J:= I; \eliminate char by shifting string down over it
repeat C:= Str(J+1);
Str(J):= C;
J:= J+1;
until C=0;
]
else I:= I+1;
];
];
char String;
[String:= "Hello^M^J World àáâã";
Text(0, String); CrLf(0);
Strip(String, false);
Text(0, String); CrLf(0);
Strip(String, true);
Text(0, String); CrLf(0);
]
Output:
Hello World àáâã Hello World àáâã Hello World
- Programming Tasks
- Solutions by Programming Task
- Ada
- AutoHotkey
- BASIC
- BBC BASIC
- C
- C++
- D
- Forth
- Fortran
- Go
- Haskell
- Icon
- Unicon
- Icon Programming Library
- J
- String manipulation
- Liberty BASIC
- Lua
- Mathematica
- MATLAB
- Octave
- OCaml
- Pascal
- Perl
- Perl 6
- PicoLisp
- Pike
- PL/I
- Protium
- PureBasic
- Python
- Racket
- REXX
- Ruby
- Run BASIC
- Scala
- Seed7
- Tcl
- TI-83 BASIC
- XPL0
- GUISS/Omit
- Openscad/Omit