{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses #-}
+moduleData.Monoid.Lexical.UTF8.Decoder
+(moduleData.Monoid.Reducer.Char
+,UTF8
+,runUTF8
+)where
+
+importData.Bits(shiftL,(.&.),(.|.))
+importData.Word(Word8)
+
+importControl.Functor.Pointed
+
+importData.Monoid.Reducer.Char
+
+-- Incrementally reduce canonical RFC3629 UTF-8 Characters
+
+-- utf8 characters are at most 4 characters long, so we need only retain state for 3 of them
+-- moreover their length is able to be determined a priori, so lets store that intrinsically in the constructor
+dataH=H0
+|H2_1{-# UNPACK #-}!Word8
+|H3_1{-# UNPACK #-}!Word8
+|H3_2{-# UNPACK #-}!Word8!Word8
+|H4_1{-# UNPACK #-}!Word8
+|H4_2{-# UNPACK #-}!Word8!Word8
+|H4_3{-# UNPACK #-}!Word8!Word8!Word8
+
+-- words expressing the tail of a character, each between 0x80 and 0xbf
+-- this is arbitrary length to simplify making the parser truly monoidal
+-- this probably means we have O(n^2) worst case performance in the face of very long runs of chars that look like 10xxxxxx
+typeT=[Word8]
+
+-- S is a segment that contains a possible tail of a character, the result of reduceing some full characters, and the start of another character
+-- T contains a list of bytes each between 0x80 and 0xbf
+dataUTF8m=STm!H
+|TT
+
+-- flush any extra characters in a head, when the next character isn't between 0x80 and 0xbf
+flushH::CharReducerm=>H->m
+flushH(H0)=mempty
+flushH(H2_1x)=invalidChar[x]
+flushH(H3_1x)=invalidChar[x]
+flushH(H3_2xy)=invalidChar[x,y]
+flushH(H4_1x)=invalidChar[x]
+flushH(H4_2xy)=invalidChar[x,y]
+flushH(H4_3xyz)=invalidChar[x,y,z]
+
+-- flush a character tail
+flushT::CharReducerm=>[Word8]->m
+flushT=invalidChar
+
+reducerH::CharReducerm=>H->Word8->(m->H->UTF8m)->m->UTF8m
+reducerHH0ckm
+|c<0x80=k(m`mappend`b1c)H0
+|c<0xc0=k(m`mappend`invalidChar[c])H0
+|c<0xe0=km(H2_1c)
+|c<0xf0=km(H3_1c)
+|c<0xf5=km(H4_1c)
+|otherwise=k(m`mappend`invalidChar[c])H0
+reducerH(H2_1c)dkm
+|d>=0x80&&d<0xc0=k(m`mappend`b2cd)H0
+|otherwise=k(m`mappend`invalidChar[c])H0
+reducerH(H3_1c)dkm
+|d>=0x80&&d<0xc0=km(H3_2cd)
+|otherwise=k(m`mappend`invalidChar[c])H0
+reducerH(H3_2cd)ekm
+|d>=0x80&&d<0xc0=k(m`mappend`b3cde)H0
+|otherwise=k(m`mappend`invalidChar[c,d])H0
+reducerH(H4_1c)dkm
+|d>=0x80&&d<0xc0=km(H4_2cd)
+|otherwise=k(m`mappend`invalidChar[c,d])H0
+reducerH(H4_2cd)ekm
+|d>=0x80&&d<0xc0=km(H4_3cde)
+|otherwise=k(m`mappend`invalidChar[c,d,e])H0
+reducerH(H4_3cde)fkm
+|d>=0x80&&d<0xc0=k(m`mappend`b4cdef)H0
+|otherwise=k(m`mappend`invalidChar[c,d,e,f])H0
+
+mask::Word8->Word8->Int
+maskcm=fromEnum(c.&.m)
+
+combine::Int->Word8->Int
+combinear=shiftLa6.|.fromEnum(r.&.0x3f)
+
+b1::CharReducerm=>Word8->m
+b1c|c<0x80=reduceChar.toEnum$fromEnumc
+|otherwise=invalidChar[c]
+
+b2::CharReducerm=>Word8->Word8->m
+b2cd|valid_b2cd=reduceChar(toEnum(combine(maskc0x1f)d))
+|otherwise=invalidChar[c,d]
+
+b3::CharReducerm=>Word8->Word8->Word8->m
+b3cde|valid_b3cde=reduceChar(toEnum(combine(combine(maskc0x0f)d)e))
+|otherwise=invalidChar[c,d,e]
+
+
+b4::CharReducerm=>Word8->Word8->Word8->Word8->m
+b4cdef|valid_b4cdef=reduceChar(toEnum(combine(combine(combine(maskc0x07)d)e)f))
+|otherwise=invalidChar[c,d,e,f]
+
+valid_b2::Word8->Word8->Bool
+valid_b2cd=(c>=0xc2&&c<=0xdf&&d>=0x80&&d<=0xbf)
+
+valid_b3::Word8->Word8->Word8->Bool
+valid_b3cde=(c==0xe0&&d>=0xa0&&d<=0xbf&&e>=0x80&&e<=0xbf)||
+(c>=0xe1&&c<=0xef&&d>=0x80&&d<=0xbf&&e>=0x80&&e<=0xbf)
+
+valid_b4::Word8->Word8->Word8->Word8->Bool
+valid_b4cdef=(c==0xf0&&d>=0x90&&d<=0xbf&&e>=0x80&&e<=0xbf&&f>=0x80&&f<=0xbf)||
+(c>=0xf1&&c<=0xf3&&d>=0x80&&d<=0xbf&&e>=0x80&&e<=0xbf&&f>=0x80&&f<=0xbf)||
+(c==0xf4&&d>=0x80&&d<=0x8f&&e>=0x80&&e<=0xbf&&f>=0x80&&f<=0xbf)
+
+reducelT::CharReducerm=>Word8->T->(H->UTF8m)->(m->UTF8m)->(T->UTF8m)->UTF8m
+reducelTccshmt
+|c<0x80=m$b1c`mappend`invalidCharscs
+|c<0xc0=t(c:cs)
+|c<0xe0=casecsof
+[]->h$H2_1c
+(d:ds)->m$b2cd`mappend`invalidCharsds
+|c<0xf0=casecsof
+[]->h$H3_1c
+[d]->h$H3_2cd
+(d:e:es)->m$b3cde`mappend`invalidCharses
+|c<0xf5=casecsof
+[]->h$H4_1c
+[d]->h$H4_2cd
+[d,e]->h$H4_3cde
+(d:e:f:fs)->m$b4cdef`mappend`invalidCharsfs
+|otherwise=mempty
+
+invalidChars::CharReducerm=>[Word8]->m
+invalidChars=foldr(mappend.invalidChar.return)mempty
+
+merge::CharReducerm=>H->T->(m->a)->(H->a)->a
+mergeH0csk_=k$invalidCharscs
+merge(H2_1c)[]_p=p$H2_1c
+merge(H2_1c)(d:ds)k_=k$b2cd`mappend`invalidCharsds
+merge(H3_1c)[]_p=p$H3_1c
+merge(H3_1c)[d]_p=p$H3_2cd
+merge(H3_1c)(d:e:es)k_=k$b3cde`mappend`invalidCharses
+merge(H3_2cd)[]_p=p$H3_2cd
+merge(H3_2cd)(e:es)k_=k$b3cde`mappend`invalidCharses
+merge(H4_1c)[]_p=p$H4_1c
+merge(H4_1c)[d]_p=p$H4_2cd
+merge(H4_1c)[d,e]_p=p$H4_3cde
+merge(H4_1c)(d:e:f:fs)k_=k$b4cdef`mappend`invalidCharsfs
+merge(H4_2cd)[]_p=p$H4_2cd
+merge(H4_2cd)[e]_p=p$H4_3cde
+merge(H4_2cd)(e:f:fs)k_=k$b4cdef`mappend`invalidCharsfs
+merge(H4_3cde)[]_p=p$H4_3cde
+merge(H4_3cde)(f:fs)k_=k$b4cdef`mappend`invalidCharsfs
+
+instanceCharReducerm=>Monoid(UTF8m)where
+mempty=T[]
+Tc`mappend`Td=T(c++d)
+Tc`mappend`Slmr=S(c++l)mr
+Slmc`mappend`Sc'm'r=Sl(m`mappend`mergecc'idflushH`mappend`m')r
+s@(S___)`mappend`T[]=s
+Slmc`mappend`Tc'=mergecc'k(Slm)where
+km'=Sl(m`mappend`m')H0
+
+instanceCharReducerm=>ReducerWord8(UTF8m)where
+Stmh`reducer`c=reducerHhc(St)m
+Tt`reducer`c|c>=0x80&&c<0xc0=T(t++[c])
+|otherwise=reducerHH0c(St)mempty
+
+c`reducel`Tcs=reducelTccs(S[]mempty)(flip(S[])H0)T
+c`reducel`Scsmh=reducelTccsk1k2k3where
+k1h'=S[](flushHh'`mappend`m)h
+k2m'=S[](m'`mappend`m)h
+k3t'=St'mh
+
+instanceFunctorUTF8where
+fmapf(Stxh)=St(fx)h
+fmap_(Tt)=Tt
+
+instancePointedUTF8where
+pointf=S[]fH0
+
+runUTF8::CharReducerm=>UTF8m->m
+runUTF8(Tt)=flushTt
+runUTF8(Stmh)=flushTt`mappend`m`mappend`flushHh
+