From 03b9afa710658ebb209999abc319e1c4b1eef306 Mon Sep 17 00:00:00 2001 From: Charles Iliya Krempeaux Date: Mon, 2 Jul 2018 01:27:49 -0700 Subject: [PATCH] utf8s.ReadRune() --- readrune.go | 158 +++++++++++++++++++++++ readrune_test.go | 320 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 478 insertions(+) create mode 100644 readrune.go create mode 100644 readrune_test.go diff --git a/readrune.go b/readrune.go new file mode 100644 index 0000000..afc2aa4 --- /dev/null +++ b/readrune.go @@ -0,0 +1,158 @@ +package utf8s + +import ( + "io" +) + +// ReadRune reads a single UTF-8 encoded Unicode character from an io.Reader, +// and returns the Unicode character (as a Go rune) and the number of bytes read. +func ReadRune(reader io.Reader) (rune, int, error) { + if nil == reader { + return 0, 0, errNilReader + } + + var count int + + var b0 byte + { + var buffer [1]byte + var p []byte = buffer[:] + + n, err := reader.Read(p) + count += n + if nil != err { + return 0, count, err + } + if 1 != n { + return 0, count, errInternalError + } + + b0 = buffer[0] + } + + if 127 >= b0 { + return rune(b0), count, nil + } + + var more int + { + switch { + + // 110x,xxxx 110x,xxxx + // 0b1100,0000 == (0b1110,0000 & b0) + case 0xC0 == (0xE0 & b0): + more = 2-1 + + // 1110,xxxx 1110,xxxx + // 0b1110,0000 == (0b1111,0000 & b0) + case 0xE0 == (0xF0 & b0): + more = 3-1 + + // 1111,0xxx 1111,0xxx + // 0b1111,0000 == (0b1111,1000 & b0) + case 0xF0 == (0xF8 & b0): + more = 4-1 + + // 1111,10xx 1111,10xx + // 0b1111,1000 == (0b1111,1100 & b0) + case 0xF8 == (0xFC & b0): + more = 5-1 + + // 1111,110x 1111,110x + // 0b1111,1100 == (0b1111,1110 & b0) + case 0xFC == (0xFE & b0): + more = 6-1 + + // 1111,1111 1111,1111 + // 0b1111,1110 == (0b1111,1111 & b0) + case 0xFE == (0xFF & b0): + more = 7-1 + + default: + return 0, count, errInternalError + } + } + + + var bs [6]byte + { + p := bs[:more] + + n, err := reader.Read(p) + count += n + if nil != err { + return 0, count, err + } + if more != n { + return 0, count, errInternalError + } + } + + var r rune + { + + var b byte + + switch { + + // 110x,xxxx 110x,xxxx + // 0b1100,0000 == (0b1110,0000 & b0) + case 0xC0 == (0xE0 & b0): + b = (0xE0^0xFF) & b0 + + // 1110,xxxx 1110,xxxx + // 0b1110,0000 == (0b1111,0000 & b0) + case 0xE0 == (0xF0 & b0): + b = (0xF0^0xFF) & b0 + + // 1111,0xxx 1111,0xxx + // 0b1111,0000 == (0b1111,1000 & b0) + case 0xF0 == (0xF8 & b0): + b = (0xF8^0xFF) & b0 + + // 1111,10xx 1111,10xx + // 0b1111,1000 == (0b1111,1100 & b0) + case 0xF8 == (0xFC & b0): + b = (0xFC^0xFF) & b0 + + // 1111,110x 1111,110x + // 0b1111,1100 == (0b1111,1110 & b0) + case 0xFC == (0xFE & b0): + b = (0xFE^0xFF) & b0 + + // 1111,1111 1111,1111 + // 0b1111,1110 == (0b1111,1111 & b0) + case 0xFE == (0xFF & b0): + //b := (0xFF^0xFF) & b0 + + default: + return 0, count, errInternalError + } + + r = rune(b) + r <<= 6 + + for i:=0; i