Commit bd80565f authored by Kevin Modzelewski

Support unicode literals in source files

Currently storing and passing unicode strings around as UTF-8-encoded bytestrings
in std::string; it might be nice to have the type system show that these are
actually unicode strings, or to use the CPython internal representation (UCS4?)
to reduce the number of encodings/decodings.
parent e0c1a8d9
...@@ -477,7 +477,20 @@ extern "C" int PyObject_IsSubclass(PyObject* derived, PyObject* cls) noexcept { ...@@ -477,7 +477,20 @@ extern "C" int PyObject_IsSubclass(PyObject* derived, PyObject* cls) noexcept {
} }
extern "C" PyObject* _PyObject_CallFunction_SizeT(PyObject* callable, const char* format, ...) noexcept { extern "C" PyObject* _PyObject_CallFunction_SizeT(PyObject* callable, const char* format, ...) noexcept {
Py_FatalError("unimplemented"); va_list va;
PyObject* args;
if (callable == NULL)
return null_error();
if (format && *format) {
va_start(va, format);
args = _Py_VaBuildValue_SizeT(format, va);
va_end(va);
} else
args = PyTuple_New(0);
return call_function_tail(callable, args);
} }
#define NEW_STYLE_NUMBER(o) PyType_HasFeature((o)->cls, Py_TPFLAGS_CHECKTYPES) #define NEW_STYLE_NUMBER(o) PyType_HasFeature((o)->cls, Py_TPFLAGS_CHECKTYPES)
......
...@@ -69,6 +69,18 @@ static int countformat(const char* format, int endchar) noexcept { ...@@ -69,6 +69,18 @@ static int countformat(const char* format, int endchar) noexcept {
return count; return count;
} }
#ifdef Py_USING_UNICODE
// Counts the Py_UNICODE elements in `u` that precede the first zero element.
// The terminating zero is not included in the count.
static int _ustrlen(Py_UNICODE* u) {
    int length = 0;
    for (Py_UNICODE* cursor = u; *cursor != 0; ++cursor)
        ++length;
    return length;
}
#endif
static PyObject* do_mktuple(const char**, va_list*, int, int, int) noexcept; static PyObject* do_mktuple(const char**, va_list*, int, int, int) noexcept;
// static PyObject *do_mklist(const char**, va_list *, int, int, int) noexcept; // static PyObject *do_mklist(const char**, va_list *, int, int, int) noexcept;
// static PyObject *do_mkdict(const char**, va_list *, int, int, int) noexcept; // static PyObject *do_mkdict(const char**, va_list *, int, int, int) noexcept;
...@@ -162,7 +174,30 @@ static PyObject* do_mkvalue(const char** p_format, va_list* p_va, int flags) noe ...@@ -162,7 +174,30 @@ static PyObject* do_mkvalue(const char** p_format, va_list* p_va, int flags) noe
} }
return v; return v;
} }
#ifdef Py_USING_UNICODE
// 'u' format unit: builds a unicode object from a Py_UNICODE* taken from the va_list.
// If the next format character is '#', an explicit length argument follows the pointer.
case 'u': {
PyObject* v;
Py_UNICODE* u = va_arg(*p_va, Py_UNICODE*);
Py_ssize_t n;
if (**p_format == '#') {
// Consume the '#' so the caller's format cursor moves past it.
++*p_format;
// The length is read as Py_ssize_t when FLAG_SIZE_T is set, otherwise as int.
if (flags & FLAG_SIZE_T)
n = va_arg(*p_va, Py_ssize_t);
else
n = va_arg(*p_va, int);
} else
// No explicit length was given; -1 makes the code below measure the buffer.
n = -1;
if (u == NULL) {
// A NULL pointer yields None; Py_INCREF takes the reference that is returned.
v = Py_None;
Py_INCREF(v);
} else {
// A negative length (no '#', or a negative value passed by the caller) means
// the length is found by scanning for the zero terminator.
if (n < 0)
n = _ustrlen(u);
v = PyUnicode_FromUnicode(u, n);
}
return v;
}
#endif
default: default:
RELEASE_ASSERT(0, "%c", *((*p_format) - 1)); RELEASE_ASSERT(0, "%c", *((*p_format) - 1));
} }
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "core/stats.h" #include "core/stats.h"
#include "core/thread_utils.h" #include "core/thread_utils.h"
#include "core/util.h" #include "core/util.h"
#include "runtime/capi.h"
#include "runtime/generator.h" #include "runtime/generator.h"
#include "runtime/import.h" #include "runtime/import.h"
#include "runtime/inline/boxing.h" #include "runtime/inline/boxing.h"
...@@ -497,7 +498,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) { ...@@ -497,7 +498,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) {
assert(node->args[1]->type == AST_TYPE::Str); assert(node->args[1]->type == AST_TYPE::Str);
Value module = visit_expr(node->args[0]); Value module = visit_expr(node->args[0]);
const std::string& name = ast_cast<AST_Str>(node->args[1])->s; auto ast_str = ast_cast<AST_Str>(node->args[1]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& name = ast_str->str_data;
assert(name.size()); assert(name.size());
v = importFrom(module.o, &name); v = importFrom(module.o, &name);
} else if (node->opcode == AST_LangPrimitive::IMPORT_NAME) { } else if (node->opcode == AST_LangPrimitive::IMPORT_NAME) {
...@@ -508,7 +511,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) { ...@@ -508,7 +511,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) {
int level = static_cast<AST_Num*>(node->args[0])->n_int; int level = static_cast<AST_Num*>(node->args[0])->n_int;
Value froms = visit_expr(node->args[1]); Value froms = visit_expr(node->args[1]);
const std::string& module_name = static_cast<AST_Str*>(node->args[2])->s; auto ast_str = ast_cast<AST_Str>(node->args[2]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& module_name = ast_str->str_data;
v = import(level, froms.o, &module_name); v = import(level, froms.o, &module_name);
} else if (node->opcode == AST_LangPrimitive::IMPORT_STAR) { } else if (node->opcode == AST_LangPrimitive::IMPORT_STAR) {
assert(node->args.size() == 1); assert(node->args.size() == 1);
...@@ -996,7 +1001,13 @@ Value ASTInterpreter::visit_set(AST_Set* node) { ...@@ -996,7 +1001,13 @@ Value ASTInterpreter::visit_set(AST_Set* node) {
} }
Value ASTInterpreter::visit_str(AST_Str* node) { Value ASTInterpreter::visit_str(AST_Str* node) {
return boxString(node->s); if (node->str_type == AST_Str::STR) {
return boxString(node->str_data);
} else if (node->str_type == AST_Str::UNICODE) {
return decodeUTF8StringPtr(&node->str_data);
} else {
RELEASE_ASSERT(0, "%d", node->str_type);
}
} }
Value ASTInterpreter::visit_name(AST_Name* node) { Value ASTInterpreter::visit_name(AST_Name* node) {
......
...@@ -1798,6 +1798,12 @@ CompilerVariable* makeStr(const std::string* s) { ...@@ -1798,6 +1798,12 @@ CompilerVariable* makeStr(const std::string* s) {
return new ValuedCompilerVariable<const std::string*>(STR_CONSTANT, s, true); return new ValuedCompilerVariable<const std::string*>(STR_CONSTANT, s, true);
} }
// Produces the compiler variable for a unicode literal.
// `s` is embedded as a constant pointer and handed to a call to
// g.funcs.decodeUTF8StringPtr emitted through the IR builder; the value of that
// call is wrapped in a variable typed as unicode_cls.
CompilerVariable* makeUnicode(IREmitter& emitter, const std::string* s) {
    llvm::Value* literal_ptr = embedConstantPtr(s, g.llvm_str_type_ptr);
    llvm::Value* decoded = emitter.getBuilder()->CreateCall(g.funcs.decodeUTF8StringPtr, literal_ptr);
    return new ConcreteCompilerVariable(typeFromClass(unicode_cls), decoded, true);
}
class VoidType : public ConcreteCompilerType { class VoidType : public ConcreteCompilerType {
public: public:
llvm::Type* llvmType() override { return g.void_; } llvm::Type* llvmType() override { return g.void_; }
......
...@@ -379,6 +379,7 @@ ConcreteCompilerVariable* makeBool(bool); ...@@ -379,6 +379,7 @@ ConcreteCompilerVariable* makeBool(bool);
ConcreteCompilerVariable* makeLong(IREmitter& emitter, std::string&); ConcreteCompilerVariable* makeLong(IREmitter& emitter, std::string&);
ConcreteCompilerVariable* makePureImaginary(IREmitter& emitter, double imag); ConcreteCompilerVariable* makePureImaginary(IREmitter& emitter, double imag);
CompilerVariable* makeStr(const std::string*); CompilerVariable* makeStr(const std::string*);
CompilerVariable* makeUnicode(IREmitter& emitter, const std::string*);
CompilerVariable* makeFunction(IREmitter& emitter, CLFunction*, CompilerVariable* closure, bool isGenerator, CompilerVariable* makeFunction(IREmitter& emitter, CLFunction*, CompilerVariable* closure, bool isGenerator,
const std::vector<ConcreteCompilerVariable*>& defaults); const std::vector<ConcreteCompilerVariable*>& defaults);
ConcreteCompilerVariable* undefVariable(); ConcreteCompilerVariable* undefVariable();
......
...@@ -515,7 +515,9 @@ private: ...@@ -515,7 +515,9 @@ private:
ConcreteCompilerVariable* converted_module = module->makeConverted(emitter, module->getBoxType()); ConcreteCompilerVariable* converted_module = module->makeConverted(emitter, module->getBoxType());
module->decvref(emitter); module->decvref(emitter);
const std::string& name = ast_cast<AST_Str>(node->args[1])->s; auto ast_str = ast_cast<AST_Str>(node->args[1]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& name = ast_str->str_data;
assert(name.size()); assert(name.size());
llvm::Value* r = emitter.createCall2(unw_info, g.funcs.importFrom, converted_module->getValue(), llvm::Value* r = emitter.createCall2(unw_info, g.funcs.importFrom, converted_module->getValue(),
...@@ -558,7 +560,9 @@ private: ...@@ -558,7 +560,9 @@ private:
ConcreteCompilerVariable* converted_froms = froms->makeConverted(emitter, froms->getBoxType()); ConcreteCompilerVariable* converted_froms = froms->makeConverted(emitter, froms->getBoxType());
froms->decvref(emitter); froms->decvref(emitter);
const std::string& module_name = static_cast<AST_Str*>(node->args[2])->s; auto ast_str = ast_cast<AST_Str>(node->args[2]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& module_name = ast_str->str_data;
llvm::Value* imported = emitter.createCall3(unw_info, g.funcs.import, getConstantInt(level, g.i32), llvm::Value* imported = emitter.createCall3(unw_info, g.funcs.import, getConstantInt(level, g.i32),
converted_froms->getValue(), converted_froms->getValue(),
...@@ -1008,7 +1012,15 @@ private: ...@@ -1008,7 +1012,15 @@ private:
return new ConcreteCompilerVariable(SLICE, rtn, true); return new ConcreteCompilerVariable(SLICE, rtn, true);
} }
CompilerVariable* evalStr(AST_Str* node, UnwindInfo unw_info) { return makeStr(&node->s); } CompilerVariable* evalStr(AST_Str* node, UnwindInfo unw_info) {
if (node->str_type == AST_Str::STR) {
return makeStr(&node->str_data);
} else if (node->str_type == AST_Str::UNICODE) {
return makeUnicode(emitter, &node->str_data);
} else {
RELEASE_ASSERT(0, "%d", node->str_type);
}
}
CompilerVariable* evalSubscript(AST_Subscript* node, UnwindInfo unw_info) { CompilerVariable* evalSubscript(AST_Subscript* node, UnwindInfo unw_info) {
CompilerVariable* value = evalExpr(node->value, unw_info); CompilerVariable* value = evalExpr(node->value, unw_info);
......
...@@ -154,7 +154,7 @@ def convert(n, f): ...@@ -154,7 +154,7 @@ def convert(n, f):
elif isinstance(v, str): elif isinstance(v, str):
_print_str(v, f) _print_str(v, f)
elif isinstance(v, unicode): elif isinstance(v, unicode):
_print_str(v.encode("ascii"), f) _print_str(v.encode("utf8"), f)
elif isinstance(v, bool): elif isinstance(v, bool):
f.write(struct.pack("B", v)) f.write(struct.pack("B", v))
elif isinstance(v, int): elif isinstance(v, int):
......
...@@ -661,12 +661,9 @@ AST_Str* read_str(BufferedReader* reader) { ...@@ -661,12 +661,9 @@ AST_Str* read_str(BufferedReader* reader) {
rtn->lineno = reader->readULL(); rtn->lineno = reader->readULL();
if (rtn->str_type == AST_Str::STR) { if (rtn->str_type == AST_Str::STR) {
rtn->s = readString(reader); rtn->str_data = readString(reader);
} else if (rtn->str_type == AST_Str::UNICODE) { } else if (rtn->str_type == AST_Str::UNICODE) {
// Don't really support unicode for now... rtn->str_data = readString(reader);
printf("Warning: converting unicode literal to str\n");
rtn->str_type = AST_Str::STR;
rtn->s = readString(reader);
} else { } else {
RELEASE_ASSERT(0, "%d", rtn->str_type); RELEASE_ASSERT(0, "%d", rtn->str_type);
} }
......
...@@ -504,7 +504,8 @@ struct expr_dispatcher { ...@@ -504,7 +504,8 @@ struct expr_dispatcher {
ResultPtr read(pypa::AstStr& s) { ResultPtr read(pypa::AstStr& s) {
AST_Str* ptr = new AST_Str(); AST_Str* ptr = new AST_Str();
location(ptr, s); location(ptr, s);
ptr->s = s.value; ptr->str_type = AST_Str::STR;
ptr->str_data = s.value;
return ptr; return ptr;
} }
...@@ -792,7 +793,7 @@ struct stmt_dispatcher { ...@@ -792,7 +793,7 @@ struct stmt_dispatcher {
AST_Str* str = new AST_Str(); AST_Str* str = new AST_Str();
ptr->value = str; ptr->value = str;
str->str_type = AST_Str::STR; str->str_type = AST_Str::STR;
str->s = d.doc; str->str_data = d.doc;
return ptr; return ptr;
} }
}; };
......
...@@ -184,6 +184,7 @@ void initGlobalFuncs(GlobalState& g) { ...@@ -184,6 +184,7 @@ void initGlobalFuncs(GlobalState& g) {
GET(createLong); GET(createLong);
GET(createPureImaginary); GET(createPureImaginary);
GET(createSet); GET(createSet);
GET(decodeUTF8StringPtr);
GET(getattr); GET(getattr);
GET(setattr); GET(setattr);
......
...@@ -34,7 +34,8 @@ struct GlobalFuncs { ...@@ -34,7 +34,8 @@ struct GlobalFuncs {
llvm::Value* boxInt, *unboxInt, *boxFloat, *unboxFloat, *boxStringPtr, *boxCLFunction, *unboxCLFunction, llvm::Value* boxInt, *unboxInt, *boxFloat, *unboxFloat, *boxStringPtr, *boxCLFunction, *unboxCLFunction,
*boxInstanceMethod, *boxBool, *unboxBool, *createTuple, *createDict, *createList, *createSlice, *boxInstanceMethod, *boxBool, *unboxBool, *createTuple, *createDict, *createList, *createSlice,
*createUserClass, *createClosure, *createGenerator, *createLong, *createSet, *createPureImaginary; *createUserClass, *createClosure, *createGenerator, *createLong, *createSet, *createPureImaginary,
*decodeUTF8StringPtr;
llvm::Value* getattr, *setattr, *delattr, *delitem, *delGlobal, *nonzero, *binop, *compare, *augbinop, *unboxedLen, llvm::Value* getattr, *setattr, *delattr, *delitem, *delGlobal, *nonzero, *binop, *compare, *augbinop, *unboxedLen,
*getitem, *getclsattr, *getGlobal, *setitem, *unaryop, *import, *importFrom, *importStar, *repr, *str, *getitem, *getclsattr, *getGlobal, *setitem, *unaryop, *import, *importFrom, *importStar, *repr, *str,
*isinstance, *yield, *getPystonIter; *isinstance, *yield, *getPystonIter;
......
...@@ -1627,7 +1627,13 @@ bool PrintVisitor::visit_slice(AST_Slice* node) { ...@@ -1627,7 +1627,13 @@ bool PrintVisitor::visit_slice(AST_Slice* node) {
} }
bool PrintVisitor::visit_str(AST_Str* node) { bool PrintVisitor::visit_str(AST_Str* node) {
printf("\"%s\"", node->s.c_str()); if (node->str_type == AST_Str::STR) {
printf("\"%s\"", node->str_data.c_str());
} else if (node->str_type == AST_Str::UNICODE) {
printf("<unicode value>");
} else {
RELEASE_ASSERT(0, "%d", node->str_type);
}
return false; return false;
} }
......
...@@ -816,18 +816,21 @@ public: ...@@ -816,18 +816,21 @@ public:
class AST_Str : public AST_expr { class AST_Str : public AST_expr {
public: public:
enum StrType { enum StrType {
UNSET = 0x00,
STR = 0x10, STR = 0x10,
UNICODE = 0x20, UNICODE = 0x20,
} str_type; } str_type;
std::string s; // The meaning of str_data depends on str_type. For STR, it's just the bytes value.
// For UNICODE, it's the utf-8 encoded value.
std::string str_data;
virtual void accept(ASTVisitor* v); virtual void accept(ASTVisitor* v);
virtual void* accept_expr(ExprVisitor* v); virtual void* accept_expr(ExprVisitor* v);
AST_Str() : AST_expr(AST_TYPE::Str) {} AST_Str() : AST_expr(AST_TYPE::Str), str_type(UNSET) {}
AST_Str(const std::string& s) : AST_expr(AST_TYPE::Str), str_type(STR), s(s) {} AST_Str(const std::string& s) : AST_expr(AST_TYPE::Str), str_type(STR), str_data(s) {}
AST_Str(const std::string&& s) : AST_expr(AST_TYPE::Str), str_type(STR), s(std::move(s)) {} AST_Str(const std::string&& s) : AST_expr(AST_TYPE::Str), str_type(STR), str_data(std::move(s)) {}
static const AST_TYPE::AST_TYPE TYPE = AST_TYPE::Str; static const AST_TYPE::AST_TYPE TYPE = AST_TYPE::Str;
}; };
......
...@@ -594,7 +594,7 @@ private: ...@@ -594,7 +594,7 @@ private:
AST_Str* orig = ast_cast<AST_Str>(val); AST_Str* orig = ast_cast<AST_Str>(val);
AST_Str* made = new AST_Str(); AST_Str* made = new AST_Str();
made->str_type = orig->str_type; made->str_type = orig->str_type;
made->s = orig->s; made->str_data = orig->str_data;
made->col_offset = orig->col_offset; made->col_offset = orig->col_offset;
made->lineno = orig->lineno; made->lineno = orig->lineno;
return made; return made;
......
...@@ -347,16 +347,40 @@ extern "C" Box* chr(Box* arg) { ...@@ -347,16 +347,40 @@ extern "C" Box* chr(Box* arg) {
return boxString(std::string(1, (char)n)); return boxString(std::string(1, (char)n));
} }
extern "C" Box* ord(Box* arg) { extern "C" Box* ord(Box* obj) {
if (arg->cls != str_cls) { long ord;
raiseExcHelper(TypeError, "ord() expected string of length 1, but %s found", getTypeName(arg)); Py_ssize_t size;
}
const std::string& s = static_cast<BoxedString*>(arg)->s; if (PyString_Check(obj)) {
size = PyString_GET_SIZE(obj);
if (size == 1) {
ord = (long)((unsigned char)*PyString_AS_STRING(obj));
return new BoxedInt(ord);
}
} else if (PyByteArray_Check(obj)) {
size = PyByteArray_GET_SIZE(obj);
if (size == 1) {
ord = (long)((unsigned char)*PyByteArray_AS_STRING(obj));
return new BoxedInt(ord);
}
if (s.size() != 1) #ifdef Py_USING_UNICODE
raiseExcHelper(TypeError, "ord() expected string of length 1, but string of length %d found", s.size()); } else if (PyUnicode_Check(obj)) {
size = PyUnicode_GET_SIZE(obj);
if (size == 1) {
ord = (long)*PyUnicode_AS_UNICODE(obj);
return new BoxedInt(ord);
}
#endif
} else {
raiseExcHelper(TypeError, "ord() expected string of length 1, but "
"%.200s found",
obj->cls->tp_name);
}
return boxInt(s[0]); raiseExcHelper(TypeError, "ord() expected a character, "
"but string of length %zd found",
size);
} }
Box* range(Box* start, Box* stop, Box* step) { Box* range(Box* start, Box* stop, Box* step) {
......
...@@ -65,6 +65,7 @@ void force() { ...@@ -65,6 +65,7 @@ void force() {
FORCE(createLong); FORCE(createLong);
FORCE(createPureImaginary); FORCE(createPureImaginary);
FORCE(createSet); FORCE(createSet);
FORCE(decodeUTF8StringPtr);
FORCE(getattr); FORCE(getattr);
FORCE(setattr); FORCE(setattr);
......
...@@ -1172,6 +1172,13 @@ extern "C" PyObject* PyObject_Init(PyObject* op, PyTypeObject* tp) noexcept { ...@@ -1172,6 +1172,13 @@ extern "C" PyObject* PyObject_Init(PyObject* op, PyTypeObject* tp) noexcept {
return op; return op;
} }
// Decodes the bytes of `*s` with PyUnicode_DecodeUTF8 in "strict" error mode and
// returns the resulting object.
Box* decodeUTF8StringPtr(const std::string* s) {
    const char* utf8_bytes = s->c_str();
    const auto num_bytes = s->size();
    Box* decoded = PyUnicode_DecodeUTF8(utf8_bytes, num_bytes, "strict");
    // Runs before the assert: presumably this turns a pending C-API error into a
    // thrown exception, leaving the assert to guard only the success path
    // (confirm against checkAndThrowCAPIException's definition).
    checkAndThrowCAPIException();
    assert(decoded);
    return decoded;
}
bool TRACK_ALLOCATIONS = false; bool TRACK_ALLOCATIONS = false;
void setupRuntime() { void setupRuntime() {
root_hcls = HiddenClass::makeRoot(); root_hcls = HiddenClass::makeRoot();
......
...@@ -108,6 +108,7 @@ Box* boxString(const std::string& s); ...@@ -108,6 +108,7 @@ Box* boxString(const std::string& s);
Box* boxString(std::string&& s); Box* boxString(std::string&& s);
extern "C" BoxedString* boxStrConstant(const char* chars); extern "C" BoxedString* boxStrConstant(const char* chars);
extern "C" BoxedString* boxStrConstantSize(const char* chars, size_t n); extern "C" BoxedString* boxStrConstantSize(const char* chars, size_t n);
extern "C" Box* decodeUTF8StringPtr(const std::string* s);
// creates an uninitialized string of length n; useful for directly constructing into the string and avoiding copies: // creates an uninitialized string of length n; useful for directly constructing into the string and avoiding copies:
BoxedString* createUninitializedString(ssize_t n); BoxedString* createUninitializedString(ssize_t n);
......
# skip-if: '-x' in EXTRA_JIT_ARGS
# allow-warning: import level 0 will be treated as -1
print repr(unicode()) print repr(unicode())
print repr(unicode('hello world')) print repr(unicode('hello world'))
# Some random unicode character:
u = u'\u0180'
print len(u)
print repr(u)
print repr(u.encode("utf8"))
# This is tricky, since we need to support file encodings, and then set stdout to UTF8:
# print u
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment