~aleteoryx/uxn

ref: 4e71c83942a68c561cb5624174ae37650027d728 uxn/grep.tal -rw-r--r-- 6.2 KiB
4e71c839Aleteoryx grep's -o flag 2 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
(	grep.tal
	by aleteoryx, all rights released
	
	an implementation of the bezza-yates-gonet "shift-or" bitap
	search algorithm, for fixed search patterns up to 16 bytes )


%dbg { #010e DEO }
%kil { #010f DEO BRK }
%p2brk { BRK }	( replace this with POP2r BRK when debugging )

( --- initialization --- )

|100
@on-reset ( -> )
	;meta #06 DEO2
	;args/checkdash #10 DEO2

	#0017 DEI EQU ?usage	( die if no args )
	;lut
	( >> )
@fill-lut ( ptr* -> )
	#ffff OVR2 STA2			( write ffff to LUT )
	INC2 INC2			( offset )
	DUP2 ;lut/end NEQ2 ?fill-lut	( loop if not at end )
	POP2
	BRK
@meta
	"grep.rom 0a
	"aleteoryx, 20 "all 20 "rights 20 "released 0a0a
	
	"bezza-yates-gonet 20 ""bitap" 20 "text 20 "search 0a00

@args/checkdash ( -> )
	[ LIT2 "- 12 ] DEI NEQ ?{
		;args/optstr #10 DEO2 BRK
	}
	;args/lastarg #10 DEO2
	( >> )

@args/lastarg ( -> )
	#17 DEI
	#02 DUP2 NEQ ?{ POP2 !lut/in }	( argument byte, parse )
	INC      EQU ?usage		( only accept one string arg )	
	
	( must be 04, end of args )
	
	( we need to shift state up 10 - lut/len bits before orring it
	  with match, this is where that's calculated. )
	;lut/len LDA2 SUB 	( the high byte will be 00 )
	;bitap/sft STA
	
	;bitap/in #10 DEO2
	[ LITr 80 ] #ffff	( prep the stack for bitap/in )
	BRK

@args/optstr ( -> )
	#17 DEI
	#03 DUP2 NEQ ?{		( end of -args? )
		POP2 ;args/lastarg #10 DEO2 BRK
	}
	INC DUP2 EQU ?usage	( we can't be the only arg )
	POP2
	
	#12
	DEIk LIT "F NEQ ?{ POP BRK }	( [F]ixed mode -- do nothing )
	DEIk LIT "- NEQ ?{ POP BRK }	( for '--' arg )

	DEIk LIT "P NEQ ?{ POP		( [P]attern mode )
		;lut/in-pattern ;lut/parser-jmp STA2
	BRK }
	DEIk LIT "o NEQ ?{ POP		( [o]nly print matches )
		#01 ;bitap/only-flag STA
	BRK }
	
	DEI ,unknopt/opt STR !unknopt

( --- printing --- )

@usage/str "usage: 20 "<program> 20 "| 20
	   "grep.rom 20 "[-FPo 20 "| 20 "--] 20 "PATTERN 0a
@usage ( -- )
	#01 ;/str !die
@unknopt/str "fatal: 20 "unknown 20 "option 20 "- &opt $1 0a
@unknopt ( -- )
	#01 ;/str !die
@toolong/str "fatal: 20 "pattern 20 "must 20 "be 20 "<16 20 "bytes. 0a
@toolong ( -- )
	#02 ;/str ( >> )
@die ( code str* -- )
	&loop
		LDAk DUP #19 DEO	( get and write char )
		STH INC2 STHr		( increment str* )
		#0a NEQ ?/loop		( loop if char != 0a )
	POP2 #0f DEO BRK		( exit with status XX )


( --- check for a match --- )

@bitap/in ( state* `match -> state* `match )
	( handle the line buffer )
	#0a12 DEI DUP		( get char, save a byte later )
	[ LIT2 &ptr =/line ]	( line pointer )
	STH2k STA STH2r		( save byte in buffer )
	INC2 ,/ptr STR2		( save new pointer )
	EQU ?/newline		( reset state on newline )

	( shift-or )
	#10 SFT2		( SHIFT in a 0 )
	lut/get-addr LDA2 ORA2	( OR with LUT value )
		
	( matches are stored in the high bit of the top byte of the
	  return stack. if the high bit is low [i.e. we got a match],
	  this will set match to 00, otherwise it will stay 80. )
	DUP2 [ LIT &sft $1 ] SFT2
	POP STH ANDr
	[ LIT &only-flag 00 ] ?/do-only
	BRK

@bitap/newline ( state* `match -> state* `match )
	;/line ,/ptr STR2	( reset line ptr )
	POP2 #ffff		( reset state )
	STHr [ LITr 80 ]	( reset match )
	,/only-flag LDR ORA	( only mode? )
	?{			( we saw a match, print the line )
		;/line
		&loop
			LDAk DUP #18 DEO	( get and write char )
			STH INC2 STHr		( increment str* )
			#0a NEQ ?/loop		( loop if char != 0a )
		POP2
	} BRK

@bitap/do-only ( `match -- `match )
	STHrk ?{
		POPr [ LITr 80 ]	( reset flag )
		,/ptr LDR2 DUP2		( offset into line )
		;lut/len LDA2		( get length to print )
		#04 SFT			( get actual length )
		SUB2 SWP2		( start* end* )
		#0a ROT ROT STA	( write a newline )
		&only-loop
			LDAk DUP #18 DEO
			STH INC2 STHr
			#0a NEQ ?/only-loop
		POP2
	} BRK


( --- look-up table generation & pattern parsing --- )

@lut/in-fixed ( -- )
	/get-addr LDA2k	( get current mask )
	/get-mask AND2	( compute new value for slot )
	SWP2 STA2	( write new mask )
	JMP2r

@lut/in-wildcard ( -- )
	/get-mask [ LIT2r =lut/end LIT2r =lut ]	( mask* `end* `addr* )
	&wc-loop
		DUP2 STH2rk LDA2 AND2		( calculate new mask )
		STH2rk STA2			( save it )
		INC2r INC2r			( inc. addr )
		NEQ2rk STHr ?/wc-loop		( loop )
	POP2r POP2r POP2
	JMP2r


@lut/in-class1 ( -- )
	;/in-class ,/parser-jmp STR2		( only do this once )
	LIT2 "- 12 DEI NEQ ?{			( leading dash )
		/in-fixed
	p2brk }
	( >> )
@lut/in-class ( -- )
	#12
	DEIk LIT "] NEQ ?{ POP			( end of class )
		;/in-pattern ,/parser-jmp STR2
	JMP2r }
	DEIk LIT "- NEQ ?{ POP			( range section )
		;/in-class-range ,/parser-jmp STR2
		,/lookbehind LDR		( last char on stack )
	p2brk }
	DEIk LIT "\ NEQ ?{ POP			( backslash escape )
		;/in-class-bs ,/parser-jmp STR2
	p2brk }

	POP /in-fixed p2brk

@lut/in-class-bs ( -- )
	;/in-class ,/parser-jmp STR2	( reset the escape )
	/in-fixed p2brk


( placed here for STR purposes )
@lut/in ( -> )
	( first, store the char )
	[ LIT2 &lookbehind $2 ]
	NIP #12 DEI ,/lookbehind STR2

	( then, update the pattern however )
	[ LIT2 &parser-jmp =/in-fixed ] JSR2

	( then update the bitmask )
	,/len LDR2	( get the length )
	#0010 ADD2	( calculate new counter value )
	,/len STR2	( save it )
	BRK

@lut/get-mask ( -- mask* )
	#0001			( gets shifted later )
	[ LIT2 &len 0000 ]	( how far into the pattern we are )
	SWP ?toolong		( die, pattern is > 16 bytes )
	SFT2 #ffff EOR2		( nth bit is 0, all others are 1 )
	JMP2r

@lut/in-pattern ( -- )
	#12
	DEIk LIT "\ NEQ ?{ POP			( backslash escape )
		;/in-bs ,/parser-jmp STR2
	p2brk }					( don't shift mask )
	DEIk LIT "[ NEQ ?{ POP			( character class )
		;/in-class1 ,/parser-jmp STR2
	p2brk }					( don't shift mask )

	DEIk LIT "? NEQ ?{ POP !/in-wildcard }	( will allow anything )

	POP !/in-fixed				( non-special )
@lut/in-bs ( -- )
	;/in-pattern ,/parser-jmp STR2	( reset the escape )
	!/in-fixed

@lut/in-class-range ( char1 -- )
	#00 SWP #10 SFT2	( 2*char1* )
	;lut ADD2		( start* ) 
	#12 DEI			( start* char2 )
	#00 SWP #10 SFT2	( start* 2*char2* )
	;lut ADD2 INC2 INC2	( start* end* )
	STH2 STH2 /get-mask	( `end* `start* mask* )
	&range-loop
		DUP2 STH2rk LDA2 AND2
		STH2rk STA2
		INC2r INC2r
		NEQ2rk STHr ?/range-loop
	POP2r POP2r POP2
	;/in-class ;/parser-jmp STA2
	p2brk


@lut/get-addr ( -- addr* )
	#0012 DEI		( input byte, prefixed with 00 )
	#10 SFT2 ;lut ADD2	( offset in lookup table )
	JMP2r

|1000
@lut ( ... ) $200 ( ... ) @lut/end
@bitap/line ( the incoming line gets written here )