;;; Parse Spencer-style regexps into the regexp ADT. ;;; Olin Shivers, July 1998. ;;; One export: (posix-string->regexp s) ;;; Need better error checking on {m,n} brace parsing. (define-module (scsh rx spencer) :use-module (ice-9 receive) :use-module (srfi srfi-14) :use-module (scsh ascii) :use-module (scsh rx re)) (export posix-string->regexp) (define (parse-posix-regexp-string s) (receive (re i) (parse-posix-exp s 0) (if (= i (string-length s)) re (error "Illegal Posix regexp -- terminated early" s i)))) (define posix-string->regexp parse-posix-regexp-string) ;;; An complete expression is a sequence of |-separated branches. (define (parse-posix-exp s i) (let ((len (string-length s))) (if (< i len) (let lp ((i i) (branches '())) (receive (branch i) (parse-posix-branch s i) (let ((branches (cons branch branches))) (if (and (< i len) (char=? #\| (string-ref s i))) (lp (+ i 1) branches) (values (re-choice (reverse branches)) i))))) (values re-trivial i)))) ;;; A branch is a sequence of pieces -- stuff that goes in-between |'s. (define (parse-posix-branch s i) (let ((len (string-length s))) (let lp ((i i) (pieces '())) (if (< i len) (receive (piece i) (parse-posix-piece s i) (let ((pieces (cons piece pieces))) (if (< i len) (case (string-ref s i) ((#\) #\|) (values (re-seq (reverse pieces)) i)) (else (lp i pieces))) (values (re-seq (reverse pieces)) i)))) (values (re-seq (reverse pieces)) i))))) ;;; A piece is an atom possibly followed by a * ? + or {...} multiplier. ;;; I.e. an element of a branch sequence. (define (parse-posix-piece s i) (let ((len (string-length s))) (receive (atom i) (parse-posix-atom s i) (if (< i len) (case (string-ref s i) ((#\* #\+ #\?) (receive (from to) (case (string-ref s i) ((#\*) (values 0 #f)) ((#\+) (values 1 #f)) ((#\?) (values 0 1))) (values (re-repeat from to atom) (+ i 1)))) ((#\{) (receive (from to i) (parse-posix-braces s (+ i 1)) (values (re-repeat from to atom) i))) (else (values atom i))) (values atom i))))) ;;; An atom is something that would bind to a following * operator -- ;;; a letter, [...] charset, ^, $, or (...). (define (parse-posix-atom s i) (let ((len (string-length s))) (if (< i (string-length s)) (let ((c (string-ref s i))) (case c ((#\^) (values re-bos (+ i 1))) ((#\$) (values re-eos (+ i 1))) ((#\.) (values re-any (+ i 1))) ((#\[) (parse-posix-bracket s (+ i 1))) ((#\() (receive (re i) (parse-posix-exp s (+ i 1)) (if (and (< i len) (char=? #\) (string-ref s i))) (values (re-submatch re) (+ i 1)) (error "Regexp subexpression has no terminating close parenthesis" s i)))) ((#\\) (let ((i (+ i 1))) (if (< i len) (values (make-re-string (string (string-ref s i))) (+ i 1)) (error "Regexps may not terminate with a backslash" s)))) ((#\) #\| #\* #\+ #\? #\{) (values re-trivial i)) (else (values (make-re-string (string c)) (+ i 1))))) (values re-trivial i)))) ;;; Parse a [...] or [^...] bracket expression into a regexp. ;;; I is the index of the char following the left bracket. (define db-cset (char-set #\. #\= #\:)) ; Not allowed after a #\[. (define (parse-posix-bracket s i) (let ((len (string-length s))) (if (>= i len) (error "Missing close right bracket in regexp" s i) (receive (negate? i0) (let ((c (string-ref s i))) (if (char=? c #\^) (values #t (+ i 1)) (values #f i))) (let lp ((i i0) (cset (char-set-copy char-set:empty))) (if (>= i len) (error "Missing close right bracket in regexp" s i) (let ((c (string-ref s i)) (i1 (+ i 1))) (case c ((#\[) ;; We don't handle [..] [==] [::] frobs. (if (and (< i1 len) (char-set-contains? db-cset (string-ref s i1))) (error "double-bracket regexps not supported." s i) (lp i1 (char-set-adjoin! cset #\[)))) ((#\]) (if (= i i0) (lp i1 (char-set-adjoin! cset #\])) (let ((cset (if negate? (char-set-complement! cset) cset))) (values (make-re-char-set cset) i1)))) ((#\-) (if (or (= i i0) ; first char or last char (and (< i1 len) (char=? #\] (string-ref s i1)))) (lp i1 (char-set-adjoin! cset #\-)) (error "Illegal - in [...] regexp" s i))) ;; Regular letter -- either alone, or startpoint of a range. (else (if (and (< (+ i1 1) len) (char=? #\- (string-ref s i1))) ;; Range (let* ((i-tochar (+ i1 1)) (to (char->ascii (string-ref s i-tochar)))) (do ((j (char->ascii c) (+ j 1)) (cset cset (char-set-adjoin! cset (ascii->char j)))) ((> j to) (lp (+ i-tochar 1) cset)))) ;; Just a letter (lp i1 (char-set-adjoin! cset c)))))))))))) ;;; Parse out a [from,to] repetition pair from a {m,n} {m} or {m,} expression. ;;; I is the index of the char following the left brace. (define (parse-posix-braces s i) (let ((comma (string-index s #\, i)) (rb (string-index s #\} i))) (if rb (if (and comma (< comma rb)) (values (string->number (substring s i comma)) (and (not (= (+ comma 1) rb)) (string->number (substring s (+ comma 1) rb))) (+ rb 1)) (let ((m (string->number (substring s i rb)))) (values m m (+ rb 1)))) (error "Missing close brace in regexp" s i))))