Fix STRING encoding/decoding

STRING is a text-based opcode which is used at protocol 0 and is followed by a \-escaped argument that runs till EOL.

- For encoding we must not use Go's %q, since it emits \u and \U escapes for the corresponding characters, and since Python does not interpret \u or \U in string literals, the data received on the Python side would be different.
- For decoding we must explicitly implement decoding for Python's 'string-escape' codec, which is what Python's pickle uses for the STRING opcode argument.

Updates: https://github.com/kisielk/og-rek/issues/48

Fix STRING encoding/decoding
STRING is a text-based opcode which is used at protocol 0 and is followed by a \-escaped argument that runs till EOL.

- For encoding we must not use Go's %q, since it emits \u and \U escapes for the corresponding characters, and since Python does not interpret \u or \U in string literals, the data received on the Python side would be different.
- For decoding we must explicitly implement decoding for Python's 'string-escape' codec, which is what Python's pickle uses for the STRING opcode argument.

Updates: https://github.com/kisielk/og-rek/issues/48
f62fe97f · Kirill Smelkov · Kamil Kisiel · 18004fbd · f62fe97f · f62fe97f
Commit f62fe97f authored Sep 25, 2018 by Kirill Smelkov Committed by Kamil Kisiel Sep 25, 2018
Show whitespace changes
Inline Side-by-side

Showing with 146 additions and 8 deletions

encode.go encode.go +4 -1

ogorek.go ogorek.go +6 -6

ogorek_test.go ogorek_test.go +10 -0

pyquote.go pyquote.go +85 -1

pyquote_test.go pyquote_test.go +41 -0

No files found.
--- a/encode.go
+++ b/encode.go
@@ -295,7 +295,10 @@ func (e *Encoder) encodeString(s string) error {
 	// protocol 0: STRING
 	// XXX Python uses both ' and " for quoting - we quote with " only.
 	// XXX -> use https://godoc.org/lab.nexedi.com/kirr/go123/xfmt#AppendQuotePy ?
-	return e.emitf("%c%q\n", opString, s)
+	//
+	// don't use %q - that will use \u and \U in quoting which python won't
+	// interpret when decoding string literals.
+	return e.emitf("%c%s\n", opString, pyquote(s))
 }
 // encodeUnicode emits UTF-8 encoded string s as unicode pickle object.

--- a/ogorek.go
+++ b/ogorek.go
@@ -636,11 +636,6 @@ func (d *Decoder) reduce() error {
 	return nil
 }
-func decodeStringEscape(b []byte) string {
-	// TODO
-	return string(b)
-}
 // Push a string
 func (d *Decoder) loadString() error {
 	line, err := d.readLine()
@@ -666,7 +661,12 @@ func (d *Decoder) loadString() error {
 		return io.ErrUnexpectedEOF
 	}
-	d.push(decodeStringEscape(line[1 : len(line)-1]))
+	s, err := pydecodeStringEscape(string(line[1 : len(line)-1]))
+	if err != nil {
+		return err
+	}
+	d.push(s)
 	return nil
 }

--- a/ogorek_test.go
+++ b/ogorek_test.go
@@ -261,6 +261,16 @@ var tests = []TestEntry{
 		// TODO BINUNICODE8
+	// str with many control characters at P0
+	// this exercises escape-based STRING coding
+	X(`str('\x80ми\nр\r\u2028\\u1234\\U00004321') # text escape`, "\x80ми\nр\r\u2028\\u1234\\U00004321",
+		P0("S\"\\x80ми\\nр\\r\\xe2\\x80\\xa8\\\\u1234\\\\U00004321\"\n."),
+		I("S\"\\x80ми\\nр\\r\\xe2\\x80\\xa8\\u1234\\U00004321\"\n.")), // \u and \U not decoded
+	X(`str("hel'lo")`, "hel'lo", I("S'hel'lo'\n.")),      // non-escaped ' inside '-quotes
+	X(`str("hel\"lo")`, "hel\"lo", I("S\"hel\"lo\"\n.")), // non-escaped " inside "-quotes
 	X("dict({})", make(map[interface{}]interface{}),
 		P0("(d."), // MARK + DICT

--- a/pyquote.go
+++ b/pyquote.go
 package ogórek
 import (
+	"fmt"
 	"strconv"
 	"unicode/utf8"
 )
@@ -10,8 +11,11 @@ import (
 // We need to avoid \u and friends, since for regular strings Python translates
 // \u to \\u, not a UTF-8 character.
 //
+// We must use Python - not Go - quoting, when emitting text strings with
+// STRING opcode.
+//
 // Dumping strings in a way that is possible to copy/paste into Python and use
-// pickletools.dis and pickle.loads there to verify a pickle is handy.
+// pickletools.dis and pickle.loads there to verify a pickle is also handy.
 func pyquote(s string) string {
 	const hexdigits = "0123456789abcdef"
 	out := make([]byte, 0, len(s))
@@ -55,3 +59,83 @@ func pyquote(s string) string {
 	return "\"" + string(out) + "\""
 }
+// pydecodeStringEscape decodes input according to "string-escape" Python codec.
+//
+// The codec is essentially defined here:
+// https://github.com/python/cpython/blob/v2.7.15-198-g69d0bc1430d/Objects/stringobject.c#L600
+func pydecodeStringEscape(s string) (string, error) {
+	out := make([]byte, 0, len(s))
+loop:
+	for {
+		r, width := utf8.DecodeRuneInString(s)
+		if width == 0 {
+			break
+		}
+		// regular UTF-8 character
+		if r != '\\' {
+			out = append(out, s[:width]...)
+			s = s[width:]
+			continue
+		}
+		if len(s) < 2 {
+			return "", strconv.ErrSyntax
+		}
+		switch c := s[1]; c {
+		// \ LF -> just skip
+		case '\n':
+			s = s[2:]
+			continue loop
+		// \\ -> \
+		case '\\':
+			out = append(out, '\\')
+			s = s[2:]
+			continue loop
+		// \' \"  (yes, both quotes are allowed to be escaped).
+		//
+		// also: both quotes are allowed to be _unescaped_ - e.g. Python
+		// unpickles "S'hel'lo'\n." as "hel'lo".
+		case '\'', '"':
+			out = append(out, c)
+			s = s[2:]
+			continue loop
+		// \c (c being any character without special meaning) -> emit \ and proceed with c
+		default:
+			out = append(out, '\\')
+			s = s[1:] // not skipping c
+			continue loop
+		// escapes we handle (NOTE no \u \U for strings)
+		case 'b','f','t','n','r','v','a':     // control characters
+		case '0','1','2','3','4','5','6','7': // octals
+	        case 'x':                             // hex
+		}
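+		// s starts with a good/known string escape prefix -> reuse strconv.UnquoteChar.
+		// NOTE(review): strconv.UnquoteChar requires exactly 3 octal digits, while
+		// Python's string-escape also accepts 1- and 2-digit octals (e.g. \0, \12);
+		// such input is reported as an error here - confirm this is acceptable.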
+		r, _, tail, err := strconv.UnquoteChar(s, 0)
+		if err != nil {
+			return "", err
+		}
+		// all above escapes must produce a single byte. This way we can
+		// append it directly, not play rune -> string UTF-8 encoding
+		// games (which break on e.g. "\x80" -> "\u0080" (= "\xc2\x80")).
+		c := byte(r)
+		if r != rune(c) {
+			panic(fmt.Sprintf("pydecode: string-escape: non-byte escaped rune %q (% x  ; from %q)",
+				r, r, s))
+		}
+		out = append(out, c)
+		s = tail
+	}
+	return string(out), nil
+}
--- a/pyquote_test.go
+++ b/pyquote_test.go
+package ogórek
+import (
+	"testing"
+)
+// CodecTestCase represents 1 test case of a coder or decoder.
+//
+// The transformation function, applied to in, must produce out.
+type CodecTestCase struct {
+	in, out string
+}
+// testCodec tests transform func applied to all test cases from testv.
+func testCodec(t *testing.T, transform func(in string)(string, error), testv []CodecTestCase) {
+	for _, tt := range testv {
+		s, err := transform(tt.in)
+		if err != nil {
+			t.Errorf("%q -> error: %s", tt.in, err)
+			continue
+		}
+		if s != tt.out {
+			t.Errorf("%q -> unexpected:\nhave: %q\nwant: %q", tt.in, s, tt.out)
+		}
+	}
+}
+func TestPyDecodeStringEscape(t *testing.T) {
+	testCodec(t, pydecodeStringEscape, []CodecTestCase{
+		{`hello`, "hello"},
+		{"hello\\\nworld", "helloworld"},
+		{`\\`, `\`},
+		{`\'\"`, `'"`},
+		{`\b\f\t\n\r\v\a`, "\b\f\t\n\r\v\a"},
+		{`\000\001\376\377`, "\000\001\376\377"},
+		{`\x00\x01\x7f\x80\xfe\xff`, "\x00\x01\x7f\x80\xfe\xff"},
+		// vvv stays as is
+		{`\u1234\U00001234\c`, `\u1234\U00001234\c`},
+	})
+}