( grep.tal
by aleteoryx, all rights released
an implementation of the bezza-yates-gonet "shift-or" bitap
search algorithm, for fixed search patterns up to 16 bytes )
%dbg { #010e DEO }
%kil { #010f DEO BRK }
%p2brk { BRK } ( replace this with POP2r BRK when debugging )
( --- initialization --- )
|100
@on-reset ( -> )
;meta #06 DEO2
;args/checkdash #10 DEO2
#0017 DEI EQU ?usage ( die if no args )
;lut
( >> )
@fill-lut ( ptr* -> )
#ffff OVR2 STA2 ( write ffff to LUT )
INC2 INC2 ( offset )
DUP2 ;lut/end NEQ2 ?fill-lut ( loop if not at end )
POP2
BRK
@meta
"grep.rom 0a
"aleteoryx, 20 "all 20 "rights 20 "released 0a0a
"bezza-yates-gonet 20 ""bitap" 20 "text 20 "search 0a00
@args/checkdash ( -> )
[ LIT2 "- 12 ] DEI NEQ ?{
;args/optstr #10 DEO2 BRK
}
;args/lastarg #10 DEO2
( >> )
@args/lastarg ( -> )
#17 DEI
#02 DUP2 NEQ ?{ POP2 !lut/in } ( argument byte, parse )
INC EQU ?usage ( only accept one string arg )
( must be 04, end of args )
( we need to shift state up 10 - lut/len bits before orring it
with match, this is where that's calculated. )
;lut/len LDA2 SUB ( the high byte will be 00 )
;bitap/sft STA
;bitap/in #10 DEO2
[ LITr 80 ] #ffff ( prep the stack for bitap/in )
BRK
@args/optstr ( -> )
#17 DEI
#03 DUP2 NEQ ?{ ( end of -args? )
POP2 ;args/lastarg #10 DEO2 BRK
}
INC DUP2 EQU ?usage ( we can't be the only arg )
POP2
#12
DEIk LIT "F NEQ ?{ POP BRK } ( [F]ixed mode -- do nothing )
DEIk LIT "- NEQ ?{ POP BRK } ( for '--' arg )
DEIk LIT "P NEQ ?{ POP ( [P]attern mode )
;lut/in-pattern ;lut/parser-jmp STA2
BRK }
DEIk LIT "o NEQ ?{ POP ( [o]nly print matches )
#01 ;bitap/only-flag STA
BRK }
DEI ,unknopt/opt STR !unknopt
( --- printing --- )
@usage/str "usage: 20 "<program> 20 "| 20
"grep.rom 20 "[-FPo 20 "| 20 "--] 20 "PATTERN 0a
@usage ( -- )
#01 ;/str !die
@unknopt/str "fatal: 20 "unknown 20 "option 20 "- &opt $1 0a
@unknopt ( -- )
#01 ;/str !die
@toolong/str "fatal: 20 "pattern 20 "must 20 "be 20 "<16 20 "bytes. 0a
@toolong ( -- )
#02 ;/str ( >> )
@die ( code str* -- )
&loop
LDAk STHk #19 DEO ( get and write char )
INC2 ( increment str* )
STHr #0a NEQ ?/loop ( loop if char != 0a )
POP2 #0f DEO BRK ( exit with status XX )
( --- check for a match --- )
@bitap/in ( state* `match -> state* `match )
( handle the line buffer )
#0a12 DEI DUP ( get char, save a byte later )
[ LIT2 &ptr =/line ] ( line pointer )
STH2k STA STH2r ( save byte in buffer )
INC2 ,/ptr STR2 ( save new pointer )
EQU ?/newline ( reset state on newline )
( shift-or )
#10 SFT2 ( SHIFT in a 0 )
lut/get-addr LDA2 ORA2 ( OR with LUT value )
( matches are stored in the high bit of the top byte of the
return stack. if the high bit is low [i.e. we got a match],
this will set match to 00, otherwise it will stay 80. )
DUP2 [ LIT &sft $1 ] SFT2
POP STH ANDr
[ LIT &only-flag 00 ] ?/do-only
BRK
@bitap/newline ( state* `match -> state* `match )
;/line ,/ptr STR2 ( reset line ptr )
POP2 #ffff ( reset state )
STHr [ LITr 80 ] ( reset match )
,/only-flag LDR ORA ( only mode? )
?{ ( we saw a match, print the line )
;/line
&loop
LDAk STHk #18 DEO ( get and write char )
INC2 ( increment str* )
STHr #0a NEQ ?/loop ( loop if char != 0a )
POP2
} BRK
@bitap/do-only ( `match -- `match )
STHrk ?{
POPr [ LITr 80 ] ( reset flag )
,/ptr LDR2 DUP2 ( offset into line )
;lut/len LDA2 ( get length to print )
#04 SFT ( get actual length )
SUB2 SWP2 ( start* end* )
#0a ROT ROT STA ( write a newline )
&only-loop
LDAk STHk #18 DEO
INC2
STHr #0a NEQ ?/only-loop
POP2
} BRK
( --- look-up table generation & pattern parsing --- )
@lut/in-fixed ( -- )
/get-addr LDA2k ( get current mask )
/get-mask AND2 ( compute new value for slot )
SWP2 STA2 ( write new mask )
JMP2r
@lut/col-negate ( -- )
/get-mask #ffff EOR2
[ LIT2r =lut/end LIT2r =lut ] ( mask* `end* `addr* )
&nc-loop
DUP2 STH2rk LDA2 EOR2 ( calculate new mask )
STH2rk STA2 ( save it )
INC2r INC2r ( inc. addr )
NEQ2rk STHr ?/nc-loop ( loop )
POP2r POP2r POP2
JMP2r
@lut/in-class1 ( -- )
;/in-class2 ,/parser-jmp STR2 ( only do this once )
LIT2 "^ 12 DEI NEQ ?{ ( negation op )
[ LIT2 00 _/negate-class ] STR ( enable negation )
p2brk }
( >> )
@lut/in-class2 ( -- )
;/in-class ,/parser-jmp STR2 ( only do this once )
LIT2 "- 12 DEI NEQ ?{ ( leading dash )
/in-fixed
p2brk }
( >> )
@lut/in-class ( -- )
#12
DEIk LIT "] NEQ ?{ POP ( end of class )
[ LIT &negate-class 01 ] ?{
[ LIT2 01 _/negate-class ] STR
/col-negate
}
;/in-pattern ,/parser-jmp STR2
JMP2r }
DEIk LIT "- NEQ ?{ POP ( range section )
;/in-class-range ,/parser-jmp STR2
,/lookbehind LDR ( last char on stack )
p2brk }
DEIk LIT "\ NEQ ?{ POP ( backslash escape )
;/in-class-bs ,/parser-jmp STR2
p2brk }
POP /in-fixed p2brk
@lut/in-class-bs ( -- )
;/in-class ,/parser-jmp STR2 ( reset the escape )
/in-fixed p2brk
( placed here for STR purposes )
@lut/in ( -> )
( first, store the char )
[ LIT2 &lookbehind $2 ]
NIP #12 DEI ,/lookbehind STR2
( then, update the pattern however )
[ LIT2 &parser-jmp =/in-fixed ] JSR2
( then update the bitmask )
,/len LDR2 ( get the length )
#0010 ADD2 ( calculate new counter value )
,/len STR2 ( save it )
BRK
@lut/get-mask ( -- mask* )
#0001 ( gets shifted later )
[ LIT2 &len 0000 ] ( how far into the pattern we are )
SWP ?toolong ( die, pattern is > 16 bytes )
SFT2 #ffff EOR2 ( nth bit is 0, all others are 1 )
JMP2r
@lut/in-pattern ( -- )
#12
DEIk LIT "\ NEQ ?{ POP ( backslash escape )
;/in-bs ,/parser-jmp STR2
p2brk } ( don't shift mask )
DEIk LIT "[ NEQ ?{ POP ( character class )
;/in-class1 ,/parser-jmp STR2
p2brk } ( don't shift mask )
DEIk LIT "? NEQ ?{ POP !/col-negate } ( will allow anything )
POP !/in-fixed ( non-special )
@lut/in-bs ( -- )
;/in-pattern ,/parser-jmp STR2 ( reset the escape )
!/in-fixed
@lut/in-class-range ( char1 -- )
#00 SWP #10 SFT2 ( 2*char1* )
;lut ADD2 ( start* )
#12 DEI ( start* char2 )
#00 SWP #10 SFT2 ( start* 2*char2* )
;lut ADD2 INC2 INC2 ( start* end* )
STH2 STH2 /get-mask ( `end* `start* mask* )
&range-loop
DUP2 STH2rk LDA2 AND2
STH2rk STA2
INC2r INC2r
NEQ2rk STHr ?/range-loop
POP2r POP2r POP2
;/in-class ;/parser-jmp STA2
p2brk
@lut/get-addr ( -- addr* )
#0012 DEI ( input byte, prefixed with 00 )
#10 SFT2 ;lut ADD2 ( offset in lookup table )
JMP2r
|1000
@lut ( ... ) $200 ( ... ) @lut/end
@bitap/line ( the incoming line gets written here )