;;; Olin Shivers, June 1998 ;;; Copyright (c) 1998 by the Scheme Underground. ;;; One export: (simplify-regexp re) -> re ;;; Regexp simplifier ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...) ;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...) ;;; (c2 ...)]) ;;; A run of BOS's or a run of EOS's in a sequence may be elided. ;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's" ;;; together; multiply the "to's" together. ;;; Exponent range [1,1] simplifies, as does [0,0]. ;;; Uniquify branches ;;; Adjacent literals in a sequence can be collapsed ;;; A singleton-char char class can be collapsed into a constant ;;; Nested choices can be collapsed ;;; Nested sequences can be collapsed ;;; An empty sequence (:) can be turned into an empty-string match "". ;;; Singleton choices and sequences can be reduced to their body. ;;; ;;; The simplifier is carefully written so that it won't blow up ;;; when applied to a dynamic regexp -- that is, ;;; - a chunk of Scheme code that produces a regexp instead of ;;; an actual regexp value; ;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code ;;; rather than integers; ;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather ;;; than an actual char-set value. ;;; This is useful because the RX macro can build such a regexp as part ;;; of its expansion process. (define-module (scsh rx simp) :use-module (ice-9 receive) :use-module (scsh rx re) :use-module (scsh rx cond-package) :use-module (srfi srfi-14) ) (export simplify-regexp) (define (simplify-regexp re) (receive (simp-re pre-dsm) (simp-re re) (re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm))))) (define (simp-re re) (? ((re-string? re) (values re 0)) ((re-seq? re) (simp-seq re)) ((re-choice? re) (simp-choice re)) ;; Singleton char-sets reduce to the character. ;; Bear in mind the cset field might be Scheme code instead ;; of an actual char set if the regexp is dynamic. ((re-char-set? re) (values (let ((cs (re-char-set:cset re))) (if (and (char-set? cs) (= 1 (char-set-size cs))) (make-re-string (string (car (char-set->list cs)))) re)) 0)) ((re-repeat? re) (simp-repeat re)) ((re-submatch? re) (simp-submatch re)) ((re-dsm? re) (simp-dsm re)) (else (values re 0)))) ;;; If the body of a submatch is the empty re, reduce it to the empty re. (define (simp-submatch re) (let ((tsm (re-submatch:tsm re)) (pre-dsm (re-submatch:pre-dsm re))) (receive (body1 pre-dsm1) (simp-re (re-submatch:body re)) (if (re-empty? body1) (values re-empty tsm) (values (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm) 0))))) ;;; - Flatten nested DSM's. ;;; - Return pre-dsm field and body field as the two return values. (define (simp-dsm re) (receive (body pre-dsm1) (simp-re (re-dsm:body re)) (values body (+ (re-dsm:pre-dsm re) pre-dsm1)))) ;;; Simplifying sequences ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; - Collapse nested sequences and DSM's. ;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.). ;;; - Bubble DSM's forwards past elts that don't contain live submatches. ;;; (Going past live submatches would switch the submatch indexes around, ;;; which would be an error). This helps to coalesce DSMs and if we bring ;;; them all the way to the front, we can pop them off and make them a ;;; pre-dsm for the entire seq record. ;;; - If an elt is the re-empty, reduce the whole re to the empty re. ;;; - Reduce singleton and empty seq. (define (simp-seq re) (let ((tsm (re-seq:tsm re)) (elts (map simplify-regexp (re-seq:elts re)))) (if (pair? elts) (call-with-current-continuation (lambda (abort) (receive (pre-dsm head tail) (simp-seq1 elts abort tsm) (values (if (pair? tail) (%make-re-seq (cons head tail) (- tsm pre-dsm)) head) ; Singleton seq pre-dsm)))) (values re-trivial 0)))) ; Empty seq ;;; Simplify the non-empty sequence ELTS. ;;; - Return the result split out into three values: ;;; [head-elt-pre-dsm, head-elt, tail]. ;;; - If any elt is the empty (impossible) re, abort by calling ;;; (abort elt tsm). TSM is otherwise unused. (define (simp-seq1 elts abort tsm) (let recur ((elt (car elts)) (elts (cdr elts))) (receive (elt pre-dsm) (open-dsm elt) (? ((re-seq? elt) ; Flatten nested seqs. (let ((sub-elts (re-seq:elts elt))) (recur (re-dsm (car sub-elts) pre-dsm 0) (append (cdr sub-elts) elts)))) ((re-empty? elt) (abort elt tsm)) ; Bomb out on the empty ; (impossible) re. ((pair? elts) (receive (next-pre-dsm next tail) ; Simplify the tail, (recur (car elts) (cdr elts)) ; then think about ; the head: ;; This guy is called when we couldn't find any other ;; simplification. If ELT contains live submatches, then ;; there really is nothing to be done at this step -- just ;; assemble the pieces together and return them. If ELT ;; *doesn't* contain any live submatches, do the same, but ;; bubble its following next-pre-dsm submatches forwards. (define (no-simp) (if (has-live-submatches? elt) (values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail)) (values (+ pre-dsm next-pre-dsm) elt (cons next tail)))) ;; Coalesces two adjacent bol's, two adjacent eol's, etc. (define (coalesce-anchor anchor?) (if (and (anchor? elt) (anchor? next)) (values (+ pre-dsm next-pre-dsm) elt tail) (no-simp))) (? ((re-trivial? elt) ; Drop trivial re's. (values (+ pre-dsm next-pre-dsm) next tail)) ;; Coalesce adjacent strings ((re-string? elt) (if (re-string? next) (values (+ pre-dsm next-pre-dsm) (make-re-string (string-append (re-string:chars elt) (re-string:chars next))) tail) (no-simp))) ;; Coalesce adjacent bol/eol/bos/eos/bow/eow's. ((re-bol? elt) (coalesce-anchor re-bol?)) ((re-eol? elt) (coalesce-anchor re-eol?)) ((re-bos? elt) (coalesce-anchor re-bos?)) ((re-eos? elt) (coalesce-anchor re-eos?)) ((re-bow? elt) (coalesce-anchor re-bow?)) ((re-eow? elt) (coalesce-anchor re-eow?)) (else (no-simp))))) (else (values pre-dsm elt '())))))) ;;; Simplifying choices ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; - Collapse nested choices and DSM's. ;;; - Delete re-empty's. ;;; - Merge sets; merge identical anchors (bos, eos, etc.). ;;; But you can't merge across an element that contains a live submatch, ;;; see below. ;;; - A singleton string "c" is included into the char-set merge as a ;;; singleton set. ;;; - Bubble DSM's forwards past elts that don't contain live submatches. ;;; (Going past live submatches would switch the submatch indexes around, ;;; which would be an error). This helps to coalesce DSMs and if we bring ;;; them all the way to the front, we can pop them off and make them a ;;; pre-dsm for the entire seq record. ;;; - Reduce singleton and empty choice. ;;; ;;; You have to be careful simplifying choices -- you can't merge two sets ;;; that appear on different sides of an element containing a live submatch. ;;; The problem is that the assignment of submatches breaks ties left-to-right. ;;; So these aren't the same: ;;; (| (submatch "x") any) (| any (submatch "x")) ;;; The first assigns the submatch, the second doesn't -- the ANY gets credit. ;;; We want to collapse multiple char-sets, bos's, and eos's, but we have ;;; to deal with this issue. So ;;; - When we coalesce anchors, we retain the *leftmost* one. ;;; - We coalesce sets that appear between live-submatch boundaries. ;;; When we do this, we subtract from the set any char that was in ;;; an earlier coalesced char-set. If this gets us down to the empty set, ;;; we drop it. If it gets us down to a singleton set, we convert it into ;;; a singleton string. ;;; Whew. I had to think about this one. (define (simp-choice re) (let ((tsm (re-choice:tsm re))) (receive (pre-dsm cset bos? eos? bol? eol? bow? eow? tail) (simp-choice1 (map simplify-regexp (re-choice:elts re))) (let ((tail (assemble-boundary-tail char-set:empty cset bos? eos? bol? eol? bow? eow? #f #f #f #f #f #f tail))) (values (if (pair? tail) (if (pair? (cdr tail)) (%make-re-choice tail (- tsm pre-dsm)) (car tail)) ; Singleton choice re-empty) ; Empty choice pre-dsm))))) ;;; Given the return values from simp-choice1, this tacks all ;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of ;;; TAIL. However, elements are not added onto TAIL that are already ;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto ;;; some earlier bit of the final result. (define (assemble-boundary-tail prev-cset cset bos? eos? bol? eol? bow? eow? prev-bos? prev-eos? prev-bol? prev-eol? prev-bow? prev-eow? tail) (let* ((cset (char-set-difference cset prev-cset)) (numchars (char-set-size cset)) (tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail)) (tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail)) (tail (if (and eow? (not prev-eow?)) (cons re-eow tail) tail)) (tail (if (and bow? (not prev-bow?)) (cons re-bow tail) tail)) (tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail)) (tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail)) (tail (? ((zero? numchars) tail) ; Drop empty char set. ((= 1 numchars) ; {c} => "c" (cons (make-re-string (string (car (char-set->list cset)))) tail)) (else (cons (make-re-char-set cset) tail))))) tail)) ;;; Simplify the non-empty list of choices ELTS. ;;; Return the result split out into the values ;;; [pre-dsm, cset, bos?, eos?, bol?, eol?, bow?, eow?, tail] (define (simp-choice1 elts) (let recur ((elts elts) (prev-cset char-set:empty) ; Chars we've already seen. (prev-bos? #f) (prev-eos? #f) ; These flags say if we've (prev-bol? #f) (prev-eol? #f) ; already seen one of these (prev-bow? #f) (prev-eow? #f)) ; anchors. (if (pair? elts) (let ((elt (car elts)) (elts (cdr elts))) (receive (elt pre-dsm) (open-dsm elt) (if (re-choice? elt) ;; Flatten nested choices. (let ((sub-elts (re-seq:elts elt))) (receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail) (recur (append sub-elts elts) prev-cset prev-bos? prev-eos? prev-bol? prev-eol? prev-bow? prev-eow?) (values (+ pre-dsm tail-pre-dsm) cset bos? eos? bol? eol? bow? eow? tail))) ;; Simplify the tail, then think about the head. (receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail) (recur elts (? ((and (re-string? elt) (= 1 (string-length (re-string:chars elt)))) (char-set-union prev-cset (string->char-set (re-string:chars elt)))) ;; The cset might be a Scheme exp. ((and (re-char-set? elt) (char-set? (re-char-set:cset elt))) (char-set-union prev-cset (re-char-set:cset elt))) (else prev-cset)) (or prev-bos? (re-bos? elt)) (or prev-eos? (re-eos? elt)) (or prev-bol? (re-bol? elt)) (or prev-eol? (re-eol? elt)) (or prev-bow? (re-bow? elt)) (or prev-eow? (re-eow? elt))) ;; This guy is called when we couldn't find any other ;; simplification. If ELT contains live submatches, then we ;; are at a merge boundary, and have to take all the ;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected ;; and tack them onto TAIL as elements, then put ELT on ;; front. Otherwise, we can commute TAIL-PRE-DSM, CSET, ;; BOS?, etc. with ELT, since it contains no live ;; submatches, so just tack ELT onto TAIL. (define (no-simp) (if (has-live-submatches? elt) (let ((tail (assemble-boundary-tail prev-cset cset bos? eos? bol? eol? bow? eow? prev-bos? prev-eos? prev-bol? prev-eol? prev-bow? prev-eow? tail))) (values pre-dsm char-set:empty #f #f #f #f #f #f (if (pair? tail) ;; Tack tail-pre-dsm onto ;; TAIL's first elt. (cons elt (cons (re-dsm (car tail) tail-pre-dsm 0) (cdr tail))) ;; Squirrel case: TAIL is empty, so use ;; TAIL-PRE-DSM as ELT's post-dsm. (list (re-dsm elt 0 tail-pre-dsm))))) ;; ELT has no live submatches, so we can commute all ;; the recursion state forwards past it. (values (+ pre-dsm tail-pre-dsm) cset bos? eos? bol? eol? bow? eow? (cons elt tail)))) (? ((and (re-char-set? elt) (char-set? (re-char-set:cset elt))) ; Might be Scheme code (values (+ pre-dsm tail-pre-dsm) (char-set-union cset (re-char-set:cset elt)) bos? eos? bol? eol? bow? eow? tail)) ;; Treat a singleton string "c" as a singleton set {c}. ((and (re-string? elt) (= 1 (string-length (re-string:chars elt)))) (values (+ pre-dsm tail-pre-dsm) (char-set-union cset (string->char-set (re-string:chars elt))) bos? eos? bol? eol? bow? eow? tail)) ;; Coalesce bol/eol/bos/eos/bow/eow's. ((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset #t eos? bol? eol? bow? eow? tail)) ((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset bos? #t bol? eol? bow? eow? tail)) ((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset bos? eos? #t eol? bow? eow? tail)) ((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset bos? eos? bol? #t bow? eow? tail)) ((re-bow? elt) (values (+ pre-dsm tail-pre-dsm) cset bos? eos? bol? eol? #t eow? tail)) ((re-eow? elt) (values (+ pre-dsm tail-pre-dsm) cset bos? eos? bol? eol? bow? #t tail)) (else (no-simp))))))) (values 0 char-set:empty #f #f #f #f #f #f '())))) (define (simp-repeat re) (let ((from (re-repeat:from re)) (to (re-repeat:to re)) (body (re-repeat:body re))) (receive (simp-body pre-dsm) (simp-re body) ; Simplify body. ;; The fancy reductions are all handled by REDUCE-REPEAT. (reduce-repeat from to simp-body pre-dsm)))) ;;; Does RE contain a live submatch? ;;; If RE is dynamic, we can't tell, so we err conservatively, ;;; which means we say "yes." (define (has-live-submatches? re) (or (re-submatch? re) (? ((re-seq? re) (every has-live-submatches? (re-seq:elts re))) ((re-choice? re) (every has-live-submatches? (re-choice:elts re))) ((re-repeat? re) (has-live-submatches? (re-repeat:body re))) ((re-dsm? re) (has-live-submatches? (re-dsm:body re))) ;; If it's not one of these things, then this isn't a regexp -- it's ;; a chunk of Scheme code producing a regexp, and we conservatively ;; return #T -- the expression *might* produce a regexp containing ;; a live submatch: (else (not (or (re-char-set? re) (re-string? re) (re-bos? re) (re-eos? re) (re-bol? re) (re-eol? re) (re-bow? re) (re-eow? re)))))))