From 5a6ef04a7c2887d96e43b7c03db125224f031afd Mon Sep 17 00:00:00 2001 From: Aleteoryx Date: Tue, 16 Sep 2025 22:43:28 -0400 Subject: [PATCH] character classes, wildcards --- grep.tal | 155 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 134 insertions(+), 21 deletions(-) diff --git a/grep.tal b/grep.tal index cebae3a45a33eabc1a67d47b1cb390aecc294e18..4536dca5a054bb58245f39d5eb3fecd293eac40d 100644 --- a/grep.tal +++ b/grep.tal @@ -5,12 +5,16 @@ search algorithm, for fixed search patterns up to 16 bytes ) +%dbg { #010e DEO } +%kil { #010f DEO BRK } +%p2brk { POP2r BRK } + ( --- initialization --- ) |100 @on-reset ( -> ) ;meta #06 DEO2 - ;args #10 DEO2 + ;args/checkdash #10 DEO2 #0017 DEI EQU ?usage ( die if no args ) ;lut @@ -27,12 +31,19 @@ "bezza-yates-gonet 20 ""bitap" 20 "text 20 "search 0a00 -@args ( -> ) +@args/checkdash ( -> ) + [ LIT2 "- 12 ] DEI NEQ ?{ + ;args/optstr #10 DEO2 BRK + } + ;args/lastarg #10 DEO2 + ( >> ) + +@args/lastarg ( -> ) #17 DEI #02 DUP2 NEQ ?{ POP2 !lut/in } ( argument byte, parse ) - INC EQU ?usage ( only accept one arg ) + INC EQU ?usage ( only accept one string arg ) - ( must be 04 ) + ( must be 04, end of args ) ( we need to shift state up 10 - lut/len bits before orring it with match, this is where that's calculated. ) @@ -42,13 +53,35 @@ ;bitap/in #10 DEO2 [ LITr 80 ] #ffff ( prep the stack for bitap/in ) BRK + +@args/optstr ( -> ) + #17 DEI + #03 DUP2 NEQ ?{ ( end of -args? ) + POP2 ;args/lastarg #10 DEO2 BRK + } + INC DUP2 EQU ?usage ( we can't be the only arg ) + POP2 + #12 + DEIk LIT "F NEQ ?{ POP BRK } ( [F]ixed mode -- do nothing ) + DEIk LIT "- NEQ ?{ POP BRK } ( for '--' arg ) + + DEIk LIT "P NEQ ?{ POP ( [P]attern mode ) + ;lut/in-pattern ;lut/parser-jmp STA2 + BRK } + DEIk LIT "o NEQ ?{ POP BRK } + + DEI ,unknopt/opt STR !unknopt ( --- printing --- ) -@usage/str "usage: 20 " 20 "| 20 "grep.rom 20 "PATTERN 0a +@usage/str "usage: 20 " 20 "| 20 + "grep.rom 20 "[-FPo 20 "| 20 "--] 20 "PATTERN 0a @usage ( -- ) #01 ;/str !die +@unknopt/str "fatal: 20 "unknown 20 "option 20 "- &opt $1 0a +@unknopt ( -- ) + #01 ;/str !die @toolong/str "fatal: 20 "pattern 20 "must 20 "be 20 "<16 20 "bytes. 0a @toolong ( -- ) #02 ;/str ( >> ) @@ -72,7 +105,7 @@ ( shift-or ) #10 SFT2 ( SHIFT in a 0 ) - lut/getaddr LDA2 ORA2 ( OR with LUT value ) + lut/get-addr LDA2 ORA2 ( OR with LUT value ) ( matches are stored in the high bit of the top byte of the return stack. if the high bit is low [i.e. we got a match], @@ -94,31 +127,111 @@ } BRK -( --- look-up table --- ) +( --- look-up table generation & pattern parsing --- ) + +@lut/in-fixed ( -- ) + /get-addr LDA2k ( get current mask ) + /get-mask AND2 ( compute new value for slot ) + SWP2 STA2 ( write new mask ) + JMP2r + +@lut/in-wildcard ( -- ) + /get-mask [ LIT2r =lut/end LIT2r =lut ] ( mask* `end* `addr* ) + &wc-loop + DUP2 STH2rk LDA2 AND2 ( calculate new mask ) + STH2rk STA2 ( save it ) + INC2r INC2r ( inc. addr ) + NEQ2rk STHr ?/wc-loop ( loop ) + POP2r POP2r POP2 + JMP2r + +@lut/in-class1 ( -- ) + ;/in-class ,/parser-jmp STR2 ( only do this once ) + LIT2 "- 12 DEI NEQ ?{ ( leading dash ) + /in-fixed + p2brk } + ( >> ) +@lut/in-class ( -- ) + #12 + DEIk LIT "] NEQ ?{ POP ( end of class ) + ;/in-pattern ,/parser-jmp STR2 + JMP2r } + DEIk LIT "- NEQ ?{ POP ( range section ) + ;/in-class-range ,/parser-jmp STR2 + ,/lookbehind LDR ( last char on stack ) + p2brk } + DEIk LIT "\ NEQ ?{ POP ( backslash escape ) + ;/in-class-bs ,/parser-jmp STR2 + p2brk } + + POP /in-fixed p2brk + +@lut/in-class-bs ( -- ) + ;/in-class ,/parser-jmp STR2 ( reset the escape ) + /in-fixed p2brk + + +( placed here for STR purposes ) @lut/in ( -> ) - ( first, compute the LUT bit ) + ( first, store the char ) + [ LIT2 &lookbehind $2 ] + NIP #12 DEI ,/lookbehind STR2 + + ( then, update the pattern however ) + [ LIT2 &parser-jmp =/in-fixed ] JSR2 + + ( then update the bitmask ) + ,/len LDR2 ( get the length ) + #0010 ADD2 ( calculate new counter value ) + ,/len STR2 ( save it ) + BRK + +@lut/get-mask ( -- mask* ) #0001 ( gets shifted later ) [ LIT2 &len 0000 ] ( how far into the pattern we are ) - STH2k ( save a byte later ) SWP ?toolong ( die, pattern is > 16 bytes ) SFT2 #ffff EOR2 ( nth bit is 0, all others are 1 ) - - ( next, write it into the LUT ) - /getaddr - STH2k LDA2 AND2 ( compute new value for slot ) - STH2r STA2 ( write new value ) - - ( last, update the counter ) - STH2r #0010 ADD2 ( calculate new counter value ) - ,/len STR2 ( save it ) - BRK + JMP2r -@lut/getaddr ( -- addr* ) +@lut/in-pattern ( -- ) + #12 + DEIk LIT "\ NEQ ?{ POP ( backslash escape ) + ;/in-bs ,/parser-jmp STR2 + p2brk } ( don't shift mask ) + DEIk LIT "[ NEQ ?{ POP ( character class ) + ;/in-class1 ,/parser-jmp STR2 + p2brk } ( don't shift mask ) + + DEIk LIT "? NEQ ?{ POP !/in-wildcard } ( will allow anything ) + + POP !/in-fixed ( non-special ) +@lut/in-bs ( -- ) + ;/in-pattern ,/parser-jmp STR2 ( reset the escape ) + !/in-fixed + +@lut/in-class-range ( char1 -- ) + #00 SWP #10 SFT2 ( 2*char1* ) + ;lut ADD2 ( start* ) + #12 DEI ( start* char2 ) + #00 SWP #10 SFT2 ( start* 2*char2* ) + ;lut ADD2 INC2 INC2 ( start* end* ) + STH2 STH2 /get-mask ( `end* `start* mask* ) + &range-loop + DUP2 STH2rk LDA2 AND2 + STH2rk STA2 + INC2r INC2r + NEQ2rk dbg STHr ?/range-loop + POP2r POP2r POP2 + ;/in-class ;/parser-jmp STA2 + p2brk + + +@lut/get-addr ( -- addr* ) #0012 DEI ( input byte, prefixed with 00 ) #10 SFT2 ;lut ADD2 ( offset in lookup table ) JMP2r - +|1000 @lut ( ... ) $200 ( ... ) @lut/end @bitap/line ( the incoming line gets written here )