Commit ee6de4f8 authored by Kirill Smelkov

Fix/Add support for []byte (= bytearray on Python side)

Since 2013 (starting with 555efd8f "first draft of dumb pickle encoder"),
the state of ogórek with respect to []byte has been:

1. []byte was encoded as string
2. there was no way to decode a pickle object into []byte

then, as

- []byte encoding was never explicitly tested,
- nor could I find any usage of such encoding by searching through all Free /
  Open-Source software that uses ogórek - I searched via "Uses" of NewEncoder on godoc:

  https://sourcegraph.com/github.com/kisielk/og-rek/-/blob/encode.go#L48:6=&tab=references:external

it is likely that []byte encoding support was added just for the sake of
it and convenience and then never used. It is also likely that the
original author does not use ogórek encoder anymore:

	https://github.com/kisielk/og-rek/pull/52#issuecomment-423639026

For those reasons I tend to think that it should be relatively safe to
change how []byte is handled:

- the reason to change []byte handling is that currently []byte is a kind of
  exception: we can only encode it, but cannot decode anything into it.
  Currently an encode/decode roundtrip for []byte gives string, which breaks
  the property of encode/decode being identity that holds for all other basic types.

- on a similar topic: when encoding, strings are assumed to be UTF-8 and are
  encoded with UNICODE opcodes for protocol >= 3. Passing arbitrary bytes
  there does not seem to be a good idea.

- as to how to change []byte handling - sadly []byte cannot be used as the
  counterpart of Python's bytes. In fact, in the previous patch we added
  ogórek.Bytes as the counterpart for Python bytes. We did not use []byte
  for that because - contrary to Python bytes - []byte cannot be used as a
  dict key.

- the most natural counterpart for Go's []byte is thus Python's
  bytearray:

	https://docs.python.org/3/library/stdtypes.html#bytearray-objects

  which is "a mutable counterpart to bytes objects"

So add Python's bytearray decoding into []byte, and correspondingly
change []byte encoding to be encoded as bytearray.

P.S.

This changes encoder semantics wrt []byte. If some use of ogórek breaks
somewhere because of it, we could add an encoder option to restore
backward-compatible behaviour. However, since I suspect no one was
actually encoding []byte into pickles, I prefer that we wait for someone to
speak up first instead of loading EncoderConfig with confusing options
that nobody will ever use.
parent 2a0b532f
......@@ -125,7 +125,7 @@ func (e *Encoder) encode(rv reflect.Value) error {
}
case reflect.Array, reflect.Slice:
if rv.Type().Elem().Kind() == reflect.Uint8 {
return e.encodeString(string(rv.Bytes()))
return e.encodeByteArray(rv.Bytes())
} else if _, ok := rv.Interface().(Tuple); ok {
return e.encodeTuple(rv.Interface().(Tuple))
} else {
......@@ -306,6 +306,16 @@ func (e *Encoder) encodeBytes(byt Bytes) error {
})
}
// encodeByteArray emits bv as a call to Python's builtin bytearray with a
// single Bytes argument, i.e. bytearray(bytes(...)). The builtin is looked up
// via pybuiltin using the encoder's configured protocol.
func (e *Encoder) encodeByteArray(bv []byte) error {
	// TODO protocol <= 2: pickle can be shorter if we emit -> bytearray(unicode, encoding)
	// instead of bytearray(_codecs.encode(unicode, encoding))
	ctor := pybuiltin(e.config.Protocol, "bytearray")
	call := Call{
		Callable: ctor,
		Args:     Tuple{Bytes(bv)},
	}
	return e.encodeCall(&call)
}
func (e *Encoder) encodeString(s string) error {
// protocol >= 3 -> encode string as unicode object
// (as python3 does)
......
......@@ -680,6 +680,32 @@ func (d *Decoder) handleCall(class Class, argv Tuple) error {
return nil
}
// handle bytearray(...) -> []byte(...)
if class == pybuiltin(d.protocol, "bytearray") {
// bytearray(bytes(...))
if len(argv) == 1 {
data, ok := argv[0].(Bytes)
if !ok {
return fmt.Errorf("bytearray: want (bytes,) ; got (%T,)", argv[0])
}
d.push([]byte(data))
return nil
}
// bytearray(unicode, encoding)
if len(argv) == 2 && argv[1] == "latin-1" {
// bytes as latin1-decode unicode
data, err := decodeLatin1Bytes(argv[0])
if err != nil {
return fmt.Errorf("bytearray: %s", err)
}
d.push([]byte(data))
return nil
}
}
return errCallNotHandled
}
......@@ -1308,3 +1334,13 @@ func decodeLatin1Bytes(arg interface{}) ([]byte, error) {
return data, nil
}
// pybuiltin returns the Class that names the Python builtin `name`.
//
// The module part depends on the pickle protocol: protocols up to and
// including 2 use "__builtin__" (py2), later protocols use "builtins" (py3).
func pybuiltin(protocol int, name string) Class {
	if protocol > 2 {
		// py3
		return Class{Module: "builtins", Name: name}
	}
	// py2
	return Class{Module: "__builtin__", Name: name}
}
......@@ -293,6 +293,25 @@ var tests = []TestEntry{
P3_("C\x0dhello\nмир\x01."), // SHORT_BINBYTES
I("B\x0d\x00\x00\x00hello\nмир\x01.")), // BINBYTES
X(`bytearray(b"hello\nмир\x01")`, []byte("hello\nмир\x01"),
// GLOBAL + MARK + UNICODE + STRING + TUPLE + REDUCE
P0("c__builtin__\nbytearray\n(c_codecs\nencode\n(Vhello\\u000aмир\x01\nS\"latin1\"\ntRtR."),
// GLOBAL + MARK + BINUNICODE + SHORT_BINSTRING + TUPLE + REDUCE
P1("c__builtin__\nbytearray\n(c_codecs\nencode\n(X\x13\x00\x00\x00hello\nмиÑ\xc2\x80\x01U\x06latin1tRtR."),
// GLOBAL + BINUNICODE + SHORT_BINSTRING + TUPLE{2,1} + REDUCE
P2("c__builtin__\nbytearray\nc_codecs\nencode\nX\x13\x00\x00\x00hello\nмиÑ\xc2\x80\x01U\x06latin1\x86R\x85R."),
// PROTO + GLOBAL + SHORT_BINBYTES + TUPLE1 + REDUCE
P3("\x80\xffcbuiltins\nbytearray\nC\rhello\nмир\x01\x85R."),
// PROTO + SHORT_BINUNICODE + STACK_GLOBAL + SHORT_BINBYTES + TUPLE1 + REDUCE
P4_("\x80\xff\x8c\x08builtins\x8c\tbytearray\x93C\rhello\nмир\x01\x85R."),
// bytearray(text, encoding); GLOBAL + BINUNICODE + TUPLE + REDUCE
I("c__builtin__\nbytearray\nq\x00(X\x13\x00\x00\x00hello\n\xc3\x90\xc2\xbc\xc3\x90\xc2\xb8\xc3\x91\xc2\x80\x01q\x01X\x07\x00\x00\x00latin-1q\x02tq\x03Rq\x04.")),
X("dict({})", make(map[interface{}]interface{}),
P0("(d."), // MARK + DICT
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment