Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: implementing kanji encoding mode #108

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
feat: support kanji encoding
yeqown committed Jun 9, 2024
commit c5bb84e0d0f79f9c83275f0f49a4067d278ef728
70 changes: 55 additions & 15 deletions chardet.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package qrcode

import (
"log"
)

// chardet.go refers to https://github.com/chardet/chardet to detect the input string's
// character set and to spot any unsupported characters in the input string.

@@ -16,35 +20,53 @@
// case3: could not use EncModeAlphanumeric, but you can find all of them in ISO-8859-1 character set, use EncModeByte.
// case4: could not use EncModeByte, use EncModeJP, no more choice.
func analyzeEncodeModeFromRaw(raw string) encMode {
analyzeFnMapping := map[encMode]analyzeEncFunc{
EncModeNumeric: analyzeNum,
EncModeAlphanumeric: analyzeAlphaNum,
EncModeByte: analyzeByte,
EncModeJP: nil,
}

var (
analyzeFn analyzeEncFunc
mode = EncModeNumeric
mode = EncModeNone
)

// loop to check each character in raw data,
// from low mode to higher while current mode could bearing the input data.
for _, byt := range raw {
reAnalyze:
if analyzeFn = analyzeFnMapping[mode]; analyzeFn == nil {
break
getNextAnalyzeFn := func() analyzeEncFunc {
switch mode {
case EncModeNumeric:
return analyzeNum
case EncModeAlphanumeric:
return analyzeAlphaNum
case EncModeByte:
return analyzeByte
case EncModeJP:
return analyzeJP
default:
}

return analyzeDefault
}

next := func() {
// switch to next mode and get next analyze function.
mode <<= 1
analyzeFn = getNextAnalyzeFn()
}

next()

// Loop to check each character in raw data,
// from low mode to higher while current mode could bear the input data.
for _, byt := range raw {
reAnalyze:
// issue#28 @borislavone reports this bug.
// FIXED(@yeqown): next encMode analyzeVersionAuto func did not check the previous byte,
// add goto statement to reanalyze previous byte which can't be analyzed in last encMode.
if !analyzeFn(byt) {
mode <<= 1
next()
goto reAnalyze
}
}

if mode > EncModeJP {
// If the mode overflow the EncModeJP, means we can't encode the input data.
log.Panicf("could not encode the input data: %s", raw)
}

return mode
}

@@ -68,9 +90,27 @@
// analyzeByte reports whether r belongs to the ISO-8859-1 character set,
// that is, whether its code point does not exceed U+00FF.
func analyzeByte(r rune) bool {
	return r <= '\u00ff'
}

// analyzeJP reports whether r falls strictly between 0x8140 and 0x9FFC, or
// strictly between 0xE040 and 0xEBBF.
// See http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
//
// NOTE(review): r is a Unicode code point, while these bounds look like
// Shift JIS double-byte ranges; confirm the comparison is intended.
func analyzeJP(r rune) bool {
	switch {
	case 0x8140 < r && r < 0x9FFC:
		return true
	case 0xE040 < r && r < 0xEBBF:
		return true
	default:
		return false
	}
}

// analyzeDefault is the fallback analyzer: it rejects every rune.
func analyzeDefault(_ rune) bool {
	return false
}
68 changes: 64 additions & 4 deletions encoder.go
Original file line number Diff line number Diff line change
@@ -5,9 +5,11 @@ package qrcode
import (
"fmt"
"log"
"strconv"

"github.com/yeqown/reedsolomon/binary"
"strconv"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/transform"
)

// encMode indicates the encoding mode of the data to be encoded.
@@ -120,7 +122,7 @@ func (e *encoder) Encode(raw string) (*binary.Binary, error) {
case EncModeNumeric, EncModeAlphanumeric, EncModeByte:
data = []byte(raw)
case EncModeJP:
// TODO: construct data []byte from raw string
data = toShiftJIS(raw)
default:
log.Printf("unsupported encoding mode: %s", getEncModeName(e.mode))
}
@@ -207,10 +209,68 @@ func (e *encoder) encodeByte(data []byte) {
}
}

// encodeKanji
// toShiftJIS
// https://www.thonky.com/qr-code-tutorial/kanji-mode-encoding
func toShiftJIS(raw string) []byte {
// FIXME: some character encoded into Shift JIS but not in the range of 0x8140-0x9FFC and 0xE040-0xEBBF.
enc := japanese.ShiftJIS.NewEncoder()
s2, _, err := transform.String(enc, raw)
if err != nil {
log.Printf("could not encode string to Shift JIS: %v", err)
return []byte{}
}

data := []byte(s2)
if len(data)%2 != 0 {
log.Panicf("shift JIS encoded []byte must be times of 2, but got %d", len(data))
}

for i := 0; i < len(data); i += 2 {
data[i], data[i+1] = encodeShiftJIS(data[i], data[i+1])
}

return data
}

// encodeShiftJIS converts one double-byte Shift JIS code, given as its high
// and low bytes, into the compacted value used for kanji mode, returned as
// the high and low bytes of that value.
//
// Codes in 0x8140-0x9FFC (inclusive) have 0x8140 subtracted; codes in
// 0xE040-0xEBBF (inclusive) have 0xC140 subtracted. The high byte of the
// difference is multiplied by 0xC0 and the low byte is added to it.
//
// A code outside both ranges is logged and yields (0, 0). Note that the valid
// code 0x8140 also maps to (0, 0), so the zero pair alone does not identify
// a failure.
func encodeShiftJIS(hi byte, lo byte) (byte, byte) {
	code := uint16(hi)<<8 | uint16(lo)

	var base uint16
	switch {
	case code >= 0x8140 && code <= 0x9FFC:
		base = 0x8140
	case code >= 0xE040 && code <= 0xEBBF:
		base = 0xC140
	default:
		// The code is printed in hex only: it is a Shift JIS value, so
		// formatting it as a Unicode character would show an unrelated glyph.
		log.Printf("0x%x is not a Shift JIS character in range 0x8140-0x9FFC or 0xE040-0xEBBF", code)
		return 0, 0
	}

	diff := code - base
	packed := (diff>>8)*0xC0 + (diff & 0xFF)

	return byte(packed >> 8), byte(packed & 0xFF)
}

// encodeKanji appends data to e.dst two bytes at a time: for each pair it
// first calls AppendByte with the first byte shifted left by 3 and a bit
// count of 5, then with the second byte and a bit count of 8.
//
// data is expected to hold an even number of bytes (toShiftJIS produces two
// bytes per character); an odd length is only logged here.
//
// NOTE(review): which bits AppendByte keeps for a count below 8 is not
// visible here; confirm the 5+8 split really yields the intended 13 bits.
func (e *encoder) encodeKanji(data []byte) {
	if len(data)%2 != 0 {
		log.Println("data must be times of 2")
	}

	// Consume the input one pair at a time; errors from AppendByte are
	// deliberately discarded, as before.
	for rest := data; len(rest) > 0; rest = rest[2:] {
		_ = e.dst.AppendByte(rest[0]<<3, 5)
		_ = e.dst.AppendByte(rest[1], 8)
	}
}

// Break Up into 8-bit Codewords and Add Pad Bytes if Necessary
26 changes: 26 additions & 0 deletions encoder_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package qrcode

import (
"bytes"
"testing"
)

@@ -48,3 +49,28 @@ func TestEncodeByte(t *testing.T) {
}
t.Log(b, b.Len())
}

// Test_toShiftJIS runs toShiftJIS over a table of inputs and compares the
// returned bytes with the expected ones.
func Test_toShiftJIS(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  []byte
	}{
		{
			name:  "test 1",
			input: "茗荷",
			want:  []byte{0x1A, 0xAA, 0x06, 0x97},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := toShiftJIS(tc.input)
			if !bytes.Equal(got, tc.want) {
				t.Errorf("toShiftJIS() = %v, want %v", got, tc.want)
			}
		})
	}
}