Commit 3bfae1c9 authored by Kevin Modzelewski

str improvements

- Allow unicode arguments to str.encode/str.decode
- Detect float parsing errors
- 'VAR_WITH_UNDERSCORES'.isupper() should be True
- str.rpartition()
- Switch to CPython's str.split
parent a3683b3e
...@@ -23,6 +23,33 @@ PyObject * _do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) { ...@@ -23,6 +23,33 @@ PyObject * _do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) {
return do_string_format(self, args, kwargs); return do_string_format(self, args, kwargs);
} }
/*
 * str.split([sep [, maxsplit]])
 *
 * Parses the optional separator and split limit from `args` and delegates to
 * the matching helper:
 *   - sep omitted or None -> stringlib_split_whitespace()
 *   - sep is a str        -> stringlib_split() on its character data
 *   - sep is a unicode    -> PyUnicode_Split() (only if Py_USING_UNICODE)
 *   - anything else       -> stringlib_split() on the buffer obtained through
 *                            PyObject_AsCharBuffer()
 *
 * Returns whatever the selected helper returns, or NULL if argument parsing
 * or the char-buffer lookup fails.
 */
PyObject * string_split(PyStringObject *self, PyObject *args)
{
    Py_ssize_t self_len = PyString_GET_SIZE(self);
    const char *self_data = PyString_AS_STRING(self);
    Py_ssize_t sep_len;
    const char *sep_data;
    Py_ssize_t maxsplit = -1;
    PyObject *sep_obj = Py_None;

    if (!PyArg_ParseTuple(args, "|On:split", &sep_obj, &maxsplit))
        return NULL;

    /* A negative limit is replaced by the largest representable count. */
    if (maxsplit < 0)
        maxsplit = PY_SSIZE_T_MAX;

    if (sep_obj == Py_None)
        return stringlib_split_whitespace((PyObject*) self, self_data, self_len, maxsplit);

    if (!PyString_Check(sep_obj)) {
#ifdef Py_USING_UNICODE
        /* Unicode separators are handed over to the unicode splitter. */
        if (PyUnicode_Check(sep_obj))
            return PyUnicode_Split((PyObject *)self, sep_obj, maxsplit);
#endif
        /* Last resort: any object exposing a character buffer. */
        if (PyObject_AsCharBuffer(sep_obj, &sep_data, &sep_len) != 0)
            return NULL;
    } else {
        sep_data = PyString_AS_STRING(sep_obj);
        sep_len = PyString_GET_SIZE(sep_obj);
    }

    return stringlib_split((PyObject*) self, self_data, self_len, sep_data, sep_len, maxsplit);
}
PyObject* string_rsplit(PyStringObject* self, PyObject* args) { PyObject* string_rsplit(PyStringObject* self, PyObject* args) {
Py_ssize_t len = PyString_GET_SIZE(self), n; Py_ssize_t len = PyString_GET_SIZE(self), n;
Py_ssize_t maxsplit = -1; Py_ssize_t maxsplit = -1;
......
...@@ -597,7 +597,13 @@ BoxedFloat* _floatNew(Box* a) { ...@@ -597,7 +597,13 @@ BoxedFloat* _floatNew(Box* a) {
if (s == "-inf") if (s == "-inf")
return new BoxedFloat(-INFINITY); return new BoxedFloat(-INFINITY);
return new BoxedFloat(strtod(s.c_str(), NULL)); // TODO this should just use CPython's implementation:
char* endptr;
const char* startptr = s.c_str();
double r = strtod(startptr, &endptr);
if (endptr != startptr + s.size())
raiseExcHelper(ValueError, "could not convert string to float: %s", s.c_str());
return new BoxedFloat(r);
} else { } else {
static const std::string float_str("__float__"); static const std::string float_str("__float__");
Box* r = callattr(a, &float_str, CallattrFlags({.cls_only = true, .null_on_nonexistent = true }), Box* r = callattr(a, &float_str, CallattrFlags({.cls_only = true, .null_on_nonexistent = true }),
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "runtime/types.h" #include "runtime/types.h"
#include "runtime/util.h" #include "runtime/util.h"
extern "C" PyObject* string_split(PyStringObject* self, PyObject* args) noexcept;
extern "C" PyObject* string_rsplit(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_rsplit(PyStringObject* self, PyObject* args) noexcept;
extern "C" PyObject* string_find(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_find(PyStringObject* self, PyObject* args) noexcept;
extern "C" PyObject* string_rfind(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_rfind(PyStringObject* self, PyObject* args) noexcept;
...@@ -1589,22 +1590,19 @@ Box* strIsUpper(BoxedString* self) { ...@@ -1589,22 +1590,19 @@ Box* strIsUpper(BoxedString* self) {
assert(isSubclass(self->cls, str_cls)); assert(isSubclass(self->cls, str_cls));
const std::string& str(self->s); const std::string& str(self->s);
bool uppered = false;
if (str.empty()) if (str.empty())
return False; return False;
bool cased = false;
for (const auto& c : str) { for (const auto& c : str) {
if (std::isspace(c) || std::isdigit(c)) { if (std::islower(c))
continue;
} else if (!std::isupper(c)) {
return False; return False;
} else { else if (!cased && isupper(c))
uppered = true; cased = true;
}
} }
return boxBool(uppered); return boxBool(cased);
} }
Box* strIsSpace(BoxedString* self) { Box* strIsSpace(BoxedString* self) {
...@@ -1734,6 +1732,21 @@ Box* strPartition(BoxedString* self, BoxedString* sep) { ...@@ -1734,6 +1732,21 @@ Box* strPartition(BoxedString* self, BoxedString* sep) {
self->s.size() - found_idx - sep->s.size()) }); self->s.size() - found_idx - sep->s.size()) });
} }
// str.rpartition(sep)
//
// Splits `self` around the LAST occurrence of `sep` and returns a 3-tuple
// (head, sep, tail).  Matching CPython:
//   - if `sep` does not occur, the result is ("", "", self) -- the whole
//     string goes in the LAST slot (unlike partition(), which puts it first);
//   - an empty separator raises ValueError("empty separator").
//
// Both arguments must be str instances; this is enforced with RELEASE_ASSERT.
Box* strRpartition(BoxedString* self, BoxedString* sep) {
    RELEASE_ASSERT(isSubclass(self->cls, str_cls), "");
    RELEASE_ASSERT(isSubclass(sep->cls, str_cls), "");

    // std::string::rfind("") always succeeds (at size()), so an empty
    // separator has to be rejected explicitly.
    if (sep->s.empty())
        raiseExcHelper(ValueError, "empty separator");

    size_t found_idx = self->s.rfind(sep->s);
    if (found_idx == std::string::npos)
        return new BoxedTuple({ boxStrConstant(""), boxStrConstant(""), self });

    // Offset of the first character after the matched separator.
    size_t tail_start = found_idx + sep->s.size();

    return new BoxedTuple({ boxStrConstantSize(self->s.c_str(), found_idx),
                            boxStrConstantSize(self->s.c_str() + found_idx, sep->s.size()),
                            boxStrConstantSize(self->s.c_str() + tail_start, self->s.size() - tail_start) });
}
extern "C" PyObject* _do_string_format(PyObject* self, PyObject* args, PyObject* kwargs); extern "C" PyObject* _do_string_format(PyObject* self, PyObject* args, PyObject* kwargs);
Box* strFormat(BoxedString* self, BoxedTuple* args, BoxedDict* kwargs) { Box* strFormat(BoxedString* self, BoxedTuple* args, BoxedDict* kwargs) {
...@@ -1746,47 +1759,6 @@ Box* strFormat(BoxedString* self, BoxedTuple* args, BoxedDict* kwargs) { ...@@ -1746,47 +1759,6 @@ Box* strFormat(BoxedString* self, BoxedTuple* args, BoxedDict* kwargs) {
return rtn; return rtn;
} }
// Native implementation of str.split(sep, maxsplit).
//
//   self       - the string being split (asserted to be a str instance)
//   sep        - a str separator, or None to split on runs of whitespace
//   _max_split - boxed int forwarded to llvm::StringRef::split as the limit
//
// Returns a new BoxedList of boxed strings.
// Raises TypeError if _max_split is not exactly an int or if sep is neither a
// str nor None, and ValueError if sep is an empty str.
// The whitespace mode only supports a negative _max_split (RELEASE_ASSERT).
Box* strSplit(BoxedString* self, BoxedString* sep, BoxedInt* _max_split) {
    assert(isSubclass(self->cls, str_cls));

    if (_max_split->cls != int_cls)
        raiseExcHelper(TypeError, "an integer is required");

    // Explicit separator: let llvm::StringRef do the splitting.
    if (isSubclass(sep->cls, str_cls)) {
        if (sep->s.empty())
            raiseExcHelper(ValueError, "empty separator");

        llvm::SmallVector<llvm::StringRef, 16> fields;
        llvm::StringRef(self->s).split(fields, sep->s, _max_split->n);

        BoxedList* result = new BoxedList();
        for (const llvm::StringRef& field : fields)
            listAppendInternal(result, boxString(field.str()));
        return result;
    }

    if (sep->cls != none_cls)
        raiseExcHelper(TypeError, "expected a character buffer object");

    // sep is None: split on runs of whitespace, never emitting empty pieces.
    RELEASE_ASSERT(_max_split->n < 0, "this case hasn't been updated to handle limited splitting amounts");

    BoxedList* result = new BoxedList();
    std::string word;
    for (char ch : self->s) {
        const bool is_whitespace = (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f' || ch == '\v');
        if (!is_whitespace) {
            word.push_back(ch);
            continue;
        }
        // A whitespace character terminates the word collected so far, if any.
        if (!word.empty()) {
            listAppendInternal(result, boxString(word));
            word.clear();
        }
    }
    // Flush the trailing word when the string does not end in whitespace.
    if (!word.empty())
        listAppendInternal(result, boxString(word));
    return result;
}
Box* strStrip(BoxedString* self, Box* chars) { Box* strStrip(BoxedString* self, Box* chars) {
assert(isSubclass(self->cls, str_cls)); assert(isSubclass(self->cls, str_cls));
...@@ -2070,9 +2042,15 @@ Box* strDecode(BoxedString* self, Box* encoding, Box* error) { ...@@ -2070,9 +2042,15 @@ Box* strDecode(BoxedString* self, Box* encoding, Box* error) {
BoxedString* encoding_str = (BoxedString*)encoding; BoxedString* encoding_str = (BoxedString*)encoding;
BoxedString* error_str = (BoxedString*)error; BoxedString* error_str = (BoxedString*)error;
if (encoding_str && encoding_str->cls == unicode_cls)
encoding_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(encoding_str, NULL);
if (encoding_str && encoding_str->cls != str_cls) if (encoding_str && encoding_str->cls != str_cls)
raiseExcHelper(TypeError, "decode() argument 1 must be string, not '%s'", getTypeName(encoding_str)); raiseExcHelper(TypeError, "decode() argument 1 must be string, not '%s'", getTypeName(encoding_str));
if (error_str && error_str->cls == unicode_cls)
error_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(error_str, NULL);
if (error_str && error_str->cls != str_cls) if (error_str && error_str->cls != str_cls)
raiseExcHelper(TypeError, "decode() argument 2 must be string, not '%s'", getTypeName(error_str)); raiseExcHelper(TypeError, "decode() argument 2 must be string, not '%s'", getTypeName(error_str));
...@@ -2089,9 +2067,15 @@ Box* strEncode(BoxedString* self, Box* encoding, Box* error) { ...@@ -2089,9 +2067,15 @@ Box* strEncode(BoxedString* self, Box* encoding, Box* error) {
BoxedString* encoding_str = (BoxedString*)encoding; BoxedString* encoding_str = (BoxedString*)encoding;
BoxedString* error_str = (BoxedString*)error; BoxedString* error_str = (BoxedString*)error;
if (encoding_str && encoding_str->cls == unicode_cls)
encoding_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(encoding_str, NULL);
if (encoding_str && encoding_str->cls != str_cls) if (encoding_str && encoding_str->cls != str_cls)
raiseExcHelper(TypeError, "encode() argument 1 must be string, not '%s'", getTypeName(encoding_str)); raiseExcHelper(TypeError, "encode() argument 1 must be string, not '%s'", getTypeName(encoding_str));
if (error_str && error_str->cls == unicode_cls)
error_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(error_str, NULL);
if (error_str && error_str->cls != str_cls) if (error_str && error_str->cls != str_cls)
raiseExcHelper(TypeError, "encode() argument 2 must be string, not '%s'", getTypeName(error_str)); raiseExcHelper(TypeError, "encode() argument 2 must be string, not '%s'", getTypeName(error_str));
...@@ -2449,6 +2433,7 @@ void strDestructor(Box* b) { ...@@ -2449,6 +2433,7 @@ void strDestructor(Box* b) {
} }
static PyMethodDef string_methods[] = { static PyMethodDef string_methods[] = {
{ "split", (PyCFunction)string_split, METH_VARARGS, NULL },
{ "rsplit", (PyCFunction)string_rsplit, METH_VARARGS, NULL }, { "rsplit", (PyCFunction)string_rsplit, METH_VARARGS, NULL },
{ "find", (PyCFunction)string_find, METH_VARARGS, NULL }, { "find", (PyCFunction)string_find, METH_VARARGS, NULL },
{ "rfind", (PyCFunction)string_rfind, METH_VARARGS, NULL }, { "rfind", (PyCFunction)string_rfind, METH_VARARGS, NULL },
...@@ -2509,6 +2494,7 @@ void setupStr() { ...@@ -2509,6 +2494,7 @@ void setupStr() {
new BoxedFunction(boxRTFunction((void*)strEndswith, BOXED_BOOL, 4, 2, 0, 0), { NULL, NULL })); new BoxedFunction(boxRTFunction((void*)strEndswith, BOXED_BOOL, 4, 2, 0, 0), { NULL, NULL }));
str_cls->giveAttr("partition", new BoxedFunction(boxRTFunction((void*)strPartition, UNKNOWN, 2))); str_cls->giveAttr("partition", new BoxedFunction(boxRTFunction((void*)strPartition, UNKNOWN, 2)));
str_cls->giveAttr("rpartition", new BoxedFunction(boxRTFunction((void*)strRpartition, UNKNOWN, 2)));
str_cls->giveAttr("format", new BoxedFunction(boxRTFunction((void*)strFormat, UNKNOWN, 1, 0, true, true))); str_cls->giveAttr("format", new BoxedFunction(boxRTFunction((void*)strFormat, UNKNOWN, 1, 0, true, true)));
...@@ -2542,9 +2528,6 @@ void setupStr() { ...@@ -2542,9 +2528,6 @@ void setupStr() {
str_cls->giveAttr("replace", str_cls->giveAttr("replace",
new BoxedFunction(boxRTFunction((void*)strReplace, UNKNOWN, 4, 1, false, false), { boxInt(-1) })); new BoxedFunction(boxRTFunction((void*)strReplace, UNKNOWN, 4, 1, false, false), { boxInt(-1) }));
str_cls->giveAttr(
"split", new BoxedFunction(boxRTFunction((void*)strSplit, LIST, 3, 2, false, false), { None, boxInt(-1) }));
for (auto& md : string_methods) { for (auto& md : string_methods) {
str_cls->giveAttr(md.ml_name, new BoxedMethodDescriptor(&md, str_cls)); str_cls->giveAttr(md.ml_name, new BoxedMethodDescriptor(&md, str_cls));
} }
......
...@@ -36,3 +36,17 @@ class F2(float): ...@@ -36,3 +36,17 @@ class F2(float):
print type(F2(D(F()))) print type(F2(D(F())))
print type(float(F())) print type(float(F()))
# A string with no numeric content: if float() raises ValueError, print its
# message; otherwise print the parsed value.
try:
f = float("hello world")
print f
except ValueError as e:
print e
# A numeric prefix followed by trailing text: a ValueError is swallowed here
# and its message is intentionally not printed (see the note below).
try:
f = float("5 hello world")
print f
except ValueError as e:
pass
# We don't print the right thing yet:
# print e
...@@ -142,3 +142,6 @@ print repr("hello\tworld\t".expandtabs(12)) ...@@ -142,3 +142,6 @@ print repr("hello\tworld\t".expandtabs(12))
print "hello world".startswith(("x", "h")) print "hello world".startswith(("x", "h"))
print "hello world".endswith(("x", "h")) print "hello world".endswith(("x", "h"))
print "a.b.c.d".partition('.')
print "a.b.c.d".rpartition('.')
...@@ -93,6 +93,9 @@ for i in xrange(256): ...@@ -93,6 +93,9 @@ for i in xrange(256):
test(c) test(c)
test_is(c) test_is(c)
for j in xrange(i, 64):
test_is(c + chr(j))
try: try:
var = 'abc' var = 'abc'
var.isalnum(42) var.isalnum(42)
......
...@@ -37,6 +37,8 @@ print p(s.encode("utf8")) ...@@ -37,6 +37,8 @@ print p(s.encode("utf8"))
print p(s.encode("utf16")) print p(s.encode("utf16"))
print p(s.encode("utf32")) print p(s.encode("utf32"))
print p(s.encode("iso_8859_15")) print p(s.encode("iso_8859_15"))
print p(s.encode(u"utf8"))
print p("hello world".encode(u"utf8"))
print repr(u' '.join(["hello", "world"])) print repr(u' '.join(["hello", "world"]))
...@@ -91,3 +93,6 @@ print "hello world".startswith(u'world') ...@@ -91,3 +93,6 @@ print "hello world".startswith(u'world')
print float(u'1.0') print float(u'1.0')
print unichr(97) print unichr(97)
print "hello world".split(u'l')
print "hello world".rsplit(u'l')
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment