feat: support kanji encoding

yeqown · yeqown · May 18, 2024 · May 18, 2024 · Jun 9, 2024 · Jun 9, 2024
commit c5bb84e0d0f79f9c83275f0f49a4067d278ef728
diff --git a/chardet.go b/chardet.go
@@ -1,5 +1,9 @@
 package qrcode
 
+import (
+	"log"
+)
+
 // chardet.go refer to https://github.com/chardet/chardet to detect input string's
 // character set, to see any unsupported character encountered in the input string.
 
@@ -16,35 +20,53 @@
 // case3: could not use EncModeAlphanumeric, but you can find all of them in ISO-8859-1 character set, use EncModeByte.
 // case4: could not use EncModeByte, use EncModeJP, no more choice.
 func analyzeEncodeModeFromRaw(raw string) encMode {
-	analyzeFnMapping := map[encMode]analyzeEncFunc{
-		EncModeNumeric:      analyzeNum,
-		EncModeAlphanumeric: analyzeAlphaNum,
-		EncModeByte:         analyzeByte,
-		EncModeJP:           nil,
-	}
-
 	var (
 		analyzeFn analyzeEncFunc
-		mode      = EncModeNumeric
+		mode      = EncModeNone
 	)
 
-	// loop to check each character in raw data,
-	// from low mode to higher while current mode could bearing the input data.
-	for _, byt := range raw {
-	reAnalyze:
-		if analyzeFn = analyzeFnMapping[mode]; analyzeFn == nil {
-			break
+	getNextAnalyzeFn := func() analyzeEncFunc {
+		switch mode {
+		case EncModeNumeric:
+			return analyzeNum
+		case EncModeAlphanumeric:
+			return analyzeAlphaNum
+		case EncModeByte:
+			return analyzeByte
+		case EncModeJP:
+			return analyzeJP
+		default:
 		}
 
+		return analyzeDefault
+	}
+
+	next := func() {
+		// Advance to the next mode and fetch the analyze function that belongs to it.
+		mode <<= 1
+		analyzeFn = getNextAnalyzeFn()
+	}
+
+	next()
+
+	// Walk every character in raw, starting from the lowest mode and moving to the
+	// next higher mode whenever the current mode cannot represent a character.
+	for _, byt := range raw {
+	reAnalyze:
 		// issue#28 @borislavone reports this bug.
 		// FIXED(@yeqown): next encMode analyzeVersionAuto func did not check the previous byte,
 		// add goto statement to reanalyze previous byte which can't be analyzed in last encMode.
 		if !analyzeFn(byt) {
-			mode <<= 1
+			next()
 			goto reAnalyze
 		}
 	}
 
+	if mode > EncModeJP {
+		// If mode has advanced past EncModeJP, no supported mode can encode the input data.
+		log.Panicf("could not encode the input data: %s", raw)
+	}
+
 	return mode
 }
 
@@ -68,9 +90,27 @@
 // analyzeByte contains ISO-8859-1 character set
 func analyzeByte(r rune) bool {
 	// ISO-8859-1 character set, if r > \u00ff, means it's not in ISO-8859-1.
 	if r > '\u00ff' {
 		return false
 	}
 
 	return true
 }
+
+// analyzeJP reports whether r is inside the Kanji ranges (0x8140, 0x9FFC) or (0xE040, 0xEBBF), bounds excluded.
+// http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
+func analyzeJP(r rune) bool {
+	// Kanji character set
+	if r > 0x8140 && r < 0x9FFC {
+		return true
+	}
+	if r > 0xE040 && r < 0xEBBF {
+		return true
+	}
+
+	return false
+}
+
+func analyzeDefault(r rune) bool {
+	return false
+}
diff --git a/encoder.go b/encoder.go
@@ -5,9 +5,11 @@ package qrcode
 import (
 	"fmt"
 	"log"
+	"strconv"
 
 	"github.com/yeqown/reedsolomon/binary"
-	"strconv"
+	"golang.org/x/text/encoding/japanese"
+	"golang.org/x/text/transform"
 )
 
 // encMode indicates the encoding mode of the data to be encoded.
@@ -120,7 +122,7 @@ func (e *encoder) Encode(raw string) (*binary.Binary, error) {
 	case EncModeNumeric, EncModeAlphanumeric, EncModeByte:
 		data = []byte(raw)
 	case EncModeJP:
-		// TODO: construct data []byte from raw string
+		data = toShiftJIS(raw)
 	default:
 		log.Printf("unsupported encoding mode: %s", getEncModeName(e.mode))
 	}
@@ -207,10 +209,68 @@ func (e *encoder) encodeByte(data []byte) {
 	}
 }
 
-// encodeKanji
+// toShiftJIS encodes raw as Shift JIS and packs every 2-byte pair with encodeShiftJIS.
 // https://www.thonky.com/qr-code-tutorial/kanji-mode-encoding
+func toShiftJIS(raw string) []byte {
+	// FIXME: some characters encode to Shift JIS values outside the ranges 0x8140-0x9FFC and 0xE040-0xEBBF.
+	enc := japanese.ShiftJIS.NewEncoder()
+	s2, _, err := transform.String(enc, raw)
+	if err != nil {
+		log.Printf("could not encode string to Shift JIS: %v", err)
+		return []byte{}
+	}
+
+	data := []byte(s2)
+	if len(data)%2 != 0 {
+		log.Panicf("shift JIS encoded []byte must be times of 2, but got %d", len(data))
+	}
+
+	for i := 0; i < len(data); i += 2 {
+		data[i], data[i+1] = encodeShiftJIS(data[i], data[i+1])
+	}
+
+	return data
+}
+
+func encodeShiftJIS(hi byte, lo byte) (byte, byte) {
+	r := uint16(hi)<<8 | uint16(lo)
+
+	fmt.Printf("before: r=%x\n", r)
+	if r > 0x8140 && r < 0x9FFC {
+		r -= 0x8140
+	} else if r > 0xE040 && r < 0xEBBF {
+		r -= 0xC140
+	} else {
+		// The value is outside both supported ranges (0x8140-0x9FFC and 0xE040-0xEBBF).
+		log.Printf("'%c'(0x%x) not a Shift JIS character out of range 0x8140-0x9FFC and 0xE040-0xEBBF", r, r)
+		return 0, 0
+	}
+
+	fmt.Printf("middle: r=%x\n", r)
+	hi = uint8(r >> 8)
+	lo = uint8(r & 0xFF)
+
+	fmt.Printf("middle: high=%x, low=%x\n", hi, lo)
+
+	r = uint16(hi)*uint16(0xC0) + uint16(lo)
+	fmt.Printf("after: r=%x\n", r)
+
+	return byte(r >> 8), byte(r & 0xFF)
+}
+
+// encodeKanji writes each 2-byte pair of data to e.dst using a 5-bit and an 8-bit AppendByte call.
 func (e *encoder) encodeKanji(data []byte) {
-	// TODO: implement encodeKanji
+	// len(data) must be a multiple of 2, since toShiftJIS encodes each character as 2 bytes.
+	if len(data)%2 != 0 {
+		log.Println("data must be times of 2")
+	}
+
+	for i := 0; i < len(data); i += 2 {
+		// Each 2-byte pair holds one kanji,
+		// which is emitted as 13 bits (5 + 8).
+		_ = e.dst.AppendByte(data[i]<<3, 5)
+		_ = e.dst.AppendByte(data[i+1], 8)
+	}
 }
 
 // Break Up into 8-bit Codewords and Add Pad Bytes if Necessary

diff --git a/encoder_test.go b/encoder_test.go
@@ -1,6 +1,7 @@
 package qrcode
 
 import (
+	"bytes"
 	"testing"
 )
 
@@ -48,3 +49,28 @@ func TestEncodeByte(t *testing.T) {
 	}
 	t.Log(b, b.Len())
 }
+
+func Test_toShiftJIS(t *testing.T) {
+	type args struct {
+		s string
+	}
+	tests := []struct {
+		name string
+		args args
+		want []byte
+	}{
+		{
+			name: "test 1",
+			args: args{"茗荷"},
+			want: []byte{0x1A, 0xAA, 0x06, 0x97},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := toShiftJIS(tt.args.s); !bytes.Equal(got, tt.want) {
+				t.Errorf("toShiftJIS() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}