~aleteoryx/uxn

5a6ef04a7c2887d96e43b7c03db125224f031afd — Aleteoryx 2 months ago 8c21bd0
character classes, wildcards
1 files changed, 134 insertions(+), 21 deletions(-)

M grep.tal
M grep.tal => grep.tal +134 -21
@@ 5,12 5,16 @@
	search algorithm, for fixed search patterns up to 16 bytes )


%dbg { #010e DEO }
%kil { #010f DEO BRK }
%p2brk { POP2r BRK }

( --- initialization --- )

|100
@on-reset ( -> )
	;meta #06 DEO2
	;args #10 DEO2
	;args/checkdash #10 DEO2

	#0017 DEI EQU ?usage	( die if no args )
	;lut


@@ 27,12 31,19 @@
	
	"bezza-yates-gonet 20 ""bitap" 20 "text 20 "search 0a00

@args ( -> )
@args/checkdash ( -> )
	[ LIT2 "- 12 ] DEI NEQ ?{
		;args/optstr #10 DEO2 BRK
	}
	;args/lastarg #10 DEO2
	( >> )

@args/lastarg ( -> )
	#17 DEI
	#02 DUP2 NEQ ?{ POP2 !lut/in }	( argument byte, parse )
	INC      EQU ?usage		( only accept one arg )	
	INC      EQU ?usage		( only accept one string arg )	
	
	( must be 04 )
	( must be 04, end of args )
	
	( we need to shift state up 10 - lut/len bits before orring it
	  with match, this is where that's calculated. )


@@ 42,13 53,35 @@
	;bitap/in #10 DEO2
	[ LITr 80 ] #ffff	( prep the stack for bitap/in )
	BRK

@args/optstr ( -> )
	#17 DEI
	#03 DUP2 NEQ ?{		( end of -args? )
		POP2 ;args/lastarg #10 DEO2 BRK
	}
	INC DUP2 EQU ?usage	( we can't be the only arg )
	POP2
	
	#12
	DEIk LIT "F NEQ ?{ POP BRK }	( [F]ixed mode -- do nothing )
	DEIk LIT "- NEQ ?{ POP BRK }	( for '--' arg )

	DEIk LIT "P NEQ ?{ POP		( [P]attern mode )
		;lut/in-pattern ;lut/parser-jmp STA2
	BRK }
	DEIk LIT "o NEQ ?{ POP BRK }
	
	DEI ,unknopt/opt STR !unknopt

( --- printing --- )

@usage/str "usage: 20 "<program> 20 "| 20 "grep.rom 20 "PATTERN 0a
@usage/str "usage: 20 "<program> 20 "| 20
	   "grep.rom 20 "[-FPo 20 "| 20 "--] 20 "PATTERN 0a
@usage ( -- )
	#01 ;/str !die
@unknopt/str "fatal: 20 "unknown 20 "option 20 "- &opt $1 0a
@unknopt ( -- )
	#01 ;/str !die
@toolong/str "fatal: 20 "pattern 20 "must 20 "be 20 "<16 20 "bytes. 0a
@toolong ( -- )
	#02 ;/str ( >> )


@@ 72,7 105,7 @@

	( shift-or )
	#10 SFT2		( SHIFT in a 0 )
	lut/getaddr LDA2 ORA2	( OR with LUT value )
	lut/get-addr LDA2 ORA2	( OR with LUT value )
	
	( matches are stored in the high bit of the top byte of the
	  return stack. if the high bit is low [i.e. we got a match],


@@ 94,31 127,111 @@
	} BRK


( --- look-up table --- )
( --- look-up table generation & pattern parsing --- )

@lut/in-fixed ( -- )
	/get-addr LDA2k	( get current mask )
	/get-mask AND2	( compute new value for slot )
	SWP2 STA2	( write new mask )
	JMP2r

@lut/in-wildcard ( -- )
	/get-mask [ LIT2r =lut/end LIT2r =lut ]	( mask* `end* `addr* )
	&wc-loop
		DUP2 STH2rk LDA2 AND2		( calculate new mask )
		STH2rk STA2			( save it )
		INC2r INC2r			( inc. addr )
		NEQ2rk STHr ?/wc-loop		( loop )
	POP2r POP2r POP2
	JMP2r


@lut/in-class1 ( -- )
	;/in-class ,/parser-jmp STR2		( only do this once )
	LIT2 "- 12 DEI NEQ ?{			( leading dash )
		/in-fixed
	p2brk }
	( >> )
@lut/in-class ( -- )
	#12
	DEIk LIT "] NEQ ?{ POP			( end of class )
		;/in-pattern ,/parser-jmp STR2
	JMP2r }
	DEIk LIT "- NEQ ?{ POP			( range section )
		;/in-class-range ,/parser-jmp STR2
		,/lookbehind LDR		( last char on stack )
	p2brk }
	DEIk LIT "\ NEQ ?{ POP			( backslash escape )
		;/in-class-bs ,/parser-jmp STR2
	p2brk }

	POP /in-fixed p2brk

@lut/in-class-bs ( -- )
	;/in-class ,/parser-jmp STR2	( reset the escape )
	/in-fixed p2brk


( placed here for STR purposes )
@lut/in ( -> )
	( first, compute the LUT bit )
	( first, store the char )
	[ LIT2 &lookbehind $2 ]
	NIP #12 DEI ,/lookbehind STR2

	( then, update the pattern however )
	[ LIT2 &parser-jmp =/in-fixed ] JSR2

	( then update the bitmask )
	,/len LDR2	( get the length )
	#0010 ADD2	( calculate new counter value )
	,/len STR2	( save it )
	BRK

@lut/get-mask ( -- mask* )
	#0001			( gets shifted later )
	[ LIT2 &len 0000 ]	( how far into the pattern we are )
	STH2k			( save a byte later )
	SWP ?toolong		( die, pattern is > 16 bytes )
	SFT2 #ffff EOR2		( nth bit is 0, all others are 1 )
	
	( next, write it into the LUT )
	/getaddr
	STH2k LDA2 AND2		( compute new value for slot )
	STH2r STA2		( write new value )
	
	( last, update the counter )
	STH2r #0010 ADD2	( calculate new counter value )
	,/len STR2		( save it )
	BRK
	JMP2r

@lut/getaddr ( -- addr* )
@lut/in-pattern ( -- )
	#12
	DEIk LIT "\ NEQ ?{ POP			( backslash escape )
		;/in-bs ,/parser-jmp STR2
	p2brk }					( don't shift mask )
	DEIk LIT "[ NEQ ?{ POP			( character class )
		;/in-class1 ,/parser-jmp STR2
	p2brk }					( don't shift mask )

	DEIk LIT "? NEQ ?{ POP !/in-wildcard }	( will allow anything )

	POP !/in-fixed				( non-special )
@lut/in-bs ( -- )
	;/in-pattern ,/parser-jmp STR2	( reset the escape )
	!/in-fixed

@lut/in-class-range ( char1 -- )
	#00 SWP #10 SFT2	( 2*char1* )
	;lut ADD2		( start* ) 
	#12 DEI			( start* char2 )
	#00 SWP #10 SFT2	( start* 2*char2* )
	;lut ADD2 INC2 INC2	( start* end* )
	STH2 STH2 /get-mask	( `end* `start* mask* )
	&range-loop
		DUP2 STH2rk LDA2 AND2
		STH2rk STA2
		INC2r INC2r
		NEQ2rk dbg STHr ?/range-loop
	POP2r POP2r POP2
	;/in-class ;/parser-jmp STA2
	p2brk


@lut/get-addr ( -- addr* )
	#0012 DEI		( input byte, prefixed with 00 )
	#10 SFT2 ;lut ADD2	( offset in lookup table )
	JMP2r


|1000
@lut ( ... ) $200 ( ... ) @lut/end
@bitap/line ( the incoming line gets written here )