|
| 1 | + |
| 2 | +module Reducer (reduce,toDot) where |
| 3 | + |
| 4 | +import Data.List |
| 5 | + |
| 6 | +-- n-grams |
| 7 | + |
| 8 | +ngramsSet 0 _ = [] |
| 9 | +ngramsSet _ [] = [] |
| 10 | +ngramsSet n xs |
| 11 | + | length ngram == n = ngram : ngramsSet n (tail xs) |
| 12 | + | otherwise = [] |
| 13 | + where |
| 14 | + ngram = take n xs |
| 15 | + |
| 16 | +ngramsCount l = [ (x,length(filter (==x) l)) | x <- nub l ] |
| 17 | + |
| 18 | +ngrams n l = ngramsCount $ ngramsSet n l |
| 19 | + |
| 20 | +allGrams l = [ (n, ngrams n l) | n <- reverse [2 .. div (length l) 2] ] |
| 21 | + |
| 22 | +filtGrams l = filter ( (>1).snd ) l -- remove n-grams with occurences == 1 |
| 23 | + |
| 24 | +-- Replace sub lists of tokens in a list |
| 25 | + |
| 26 | +replace _ _ [] = [] |
| 27 | +replace pat sub l = if (pat == take (length pat) l) then |
| 28 | + sub : replace pat sub (drop (length pat) l) |
| 29 | + else |
| 30 | + (head l) : replace pat sub (tail l) |
| 31 | + |
| 32 | +-- Count the number of replace sub lists of tokens in a list |
| 33 | + |
| 34 | +replaceCount _ [] = 0 |
| 35 | +replaceCount pat l |
| 36 | + | pat == take (length pat) l = 1 + replaceCount pat (drop (length pat) l) |
| 37 | + | otherwise = 0 + replaceCount pat (tail l) |
| 38 | + |
| 39 | +-- main reduce function |
| 40 | + |
| 41 | +reduce l = reduceAux l [] |
| 42 | + |
| 43 | +reduceAux l ps = let a = allGrams l |
| 44 | + f = [ (fst i, filtGrams (snd i)) | i <- a ] |
| 45 | + f2 = filter ( (/=[]).snd ) f |
| 46 | + flat = map fst (concat (map snd f2)) |
| 47 | + (x, y) = replaceAll l flat 0 [] |
| 48 | + in |
| 49 | + if y == [] then (x, ps) |
| 50 | + else reduceAux x (ps++y) |
| 51 | + |
| 52 | +replaceAll l [] _ ps = (l, ps) |
| 53 | +replaceAll l (x:xs) c ps = |
| 54 | + let cnt = replaceCount x l |
| 55 | + pid = nextId c |
| 56 | + in |
| 57 | + if cnt>1 then replaceAll (replace x pid l) xs (c+1) (ps++[(pid, x)]) |
| 58 | + else replaceAll l xs c ps |
| 59 | + |
| 60 | +nextId c = "P" ++ show c -- create next pattern id |
| 61 | + |
| 62 | +-- debugging |
| 63 | + |
| 64 | +getP p [] = [] |
| 65 | +getP p (x:xs) |
| 66 | + | p == (fst x) = snd x |
| 67 | + | otherwise = getP p xs |
| 68 | + |
| 69 | +toDot (expr,_) = |
| 70 | + let header = ["digraph auto {","rankdir=LR;", "size=\"7,5\";", "node [shape = circle];"] |
| 71 | + edges = [ toEdge e | e <- zip [1..] (ngramsSet 2 expr) ] |
| 72 | + in unlines $ concat [header,edges,["}"]] |
| 73 | + |
| 74 | +toEdge (n,[f,t]) = concat ["\"",f,"\" -> \"",t,"\" [ label =\"",show n,"\" ];"] |
| 75 | + |
0 commit comments