Merge pull request #542 from rntz/unwinder

unwinder

Merge pull request #542 from rntz/unwinder
unwinder
9de31610 · Kevin Modzelewski · 2c33f2ee · 1552ac34 · 9de31610 · 9de31610
Commit 9de31610 authored May 22, 2015 by Kevin Modzelewski
35 changed files
--- a/.gitmodules
+++ b/.gitmodules
 [submodule "libunwind"]
 	path = libunwind
 	url = git://git.sv.gnu.org/libunwind.git
+        ignore = all
 [submodule "libpypa"]
 	path = libpypa
 	url = git://github.com/vinzenz/libpypa.git

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,15 +71,22 @@ endif()

 execute_process(COMMAND cat llvm_revision.txt WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE LLVMREV OUTPUT_STRIP_TRAILING_WHITESPACE)

-# llvm and clang patches
+# llvm, clang, and libunwind patches
 add_custom_target(llvm_gotorev python ${CMAKE_SOURCE_DIR}/tools/git_svn_gotorev.py ${DEPS_DIR}/llvm-trunk ${LLVMREV} llvm_patches WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
 add_custom_target(clang_gotorev python ${CMAKE_SOURCE_DIR}/tools/git_svn_gotorev.py ${DEPS_DIR}/llvm-trunk/tools/clang ${LLVMREV} clang_patches WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
 add_custom_target(llvm_up DEPENDS llvm_gotorev clang_gotorev)

+set(LIBUNWIND_PATCHES
+  ${CMAKE_SOURCE_DIR}/libunwind_patches/0001-pyston-add-lots-of-comments.patch
+  ${CMAKE_SOURCE_DIR}/libunwind_patches/0002-pyston-stop-x86_64-setcontext-restoring-uninitialize.patch)
+
 add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/gitmodules
-                   COMMAND git submodule update --init WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+                   # Fetch the submodules, then run tools/git_am_automated.py on the libunwind checkout with the
+                   # patches listed in LIBUNWIND_PATCHES.
+                   COMMAND git submodule update --init
+                   COMMAND python ${CMAKE_SOURCE_DIR}/tools/git_am_automated.py libunwind ${LIBUNWIND_PATCHES}
-                   COMMAND cmake -E touch ${CMAKE_BINARY_DIR}/gitmodules
+                   # Refresh the stamp file with the cmake that configured this build, not whichever `cmake`
+                   # (if any) happens to be first on PATH.
+                   COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_BINARY_DIR}/gitmodules
-                   DEPENDS ${CMAKE_SOURCE_DIR}/.gitmodules)
+                   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+                   DEPENDS ${CMAKE_SOURCE_DIR}/.gitmodules
+                   DEPENDS ${LIBUNWIND_PATCHES})
 add_custom_target(gitsubmodules DEPENDS ${CMAKE_BINARY_DIR}/gitmodules)

 # llvm
@@ -100,13 +107,17 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
  set(LIBUNWIND_DEBUG_CFLAGS "CFLAGS=-O0 -g")
  set(LIBUNWIND_DEBUG "--enable-debug")
  set(LIBUNWIND_DEBUG_FRAME "--enable-debug-frame")
+  set(LIBUNWIND_CONSERVATIVE_CHECKS "--enable-conservative-checks")
+else()
+  set(LIBUNWIND_CONSERVATIVE_CHECKS "--disable-conservative-checks")
 endif()
 ExternalProject_Add(libunwind
                    PREFIX libunwind
                    SOURCE_DIR ${CMAKE_SOURCE_DIR}/libunwind
+                    # TODO: more accurate DEPENDS - should depend on *contents* of libunwind/ source directory
                    DEPENDS gitsubmodules
                    UPDATE_COMMAND autoreconf -i
-                    CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/libunwind/configure ${LIBUNWIND_DEBUG_CFLAGS} --prefix=${CMAKE_BINARY_DIR}/libunwind --enable-shared=0 ${LIBUNWIND_DEBUG} ${LIBUNWIND_DEBUG_FRAME}
+                    CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/libunwind/configure ${LIBUNWIND_DEBUG_CFLAGS} --prefix=${CMAKE_BINARY_DIR}/libunwind --enable-shared=0 --disable-block-signals ${LIBUNWIND_CONSERVATIVE_CHECKS} ${LIBUNWIND_DEBUG} ${LIBUNWIND_DEBUG_FRAME}
                    LOG_UPDATE ON
                    LOG_CONFIGURE ON
                    LOG_BUILD ON

--- a/Makefile
+++ b/Makefile
+# prints variables for debugging
+print-%: ; @echo $($*)
+
 # Disable builtin rules:
 .SUFFIXES:

@@ -955,11 +958,11 @@ CMAKE_SHAREDMODS := sharedmods ext_pyston

 .PHONY: pyston_dbg pyston_release
 pyston_dbg: $(CMAKE_SETUP_DBG)
-	$(NINJA) -C $(HOME)/pyston-build-dbg pyston copy_stdlib copy_libpyston $(CMAKE_SHAREDMODS) ext_cpython $(NINJAFLAGS)
-	ln -sf $(HOME)/pyston-build-dbg/pyston pyston_dbg
+	$(NINJA) -C $(CMAKE_DIR_DBG) pyston copy_stdlib copy_libpyston $(CMAKE_SHAREDMODS) ext_cpython $(NINJAFLAGS)
+	ln -sf $(CMAKE_DIR_DBG)/pyston pyston_dbg
 pyston_release: $(CMAKE_SETUP_RELEASE)
-	$(NINJA) -C $(HOME)/pyston-build-release pyston copy_stdlib copy_libpyston $(CMAKE_SHAREDMODS) ext_cpython $(NINJAFLAGS)
-	ln -sf $(HOME)/pyston-build-release/pyston pyston_release
+	$(NINJA) -C $(CMAKE_DIR_RELEASE) pyston copy_stdlib copy_libpyston $(CMAKE_SHAREDMODS) ext_cpython $(NINJAFLAGS)
+	ln -sf $(CMAKE_DIR_RELEASE)/pyston pyston_release
 endif
 CMAKE_DIR_GCC := $(HOME)/pyston-build-gcc
 CMAKE_SETUP_GCC := $(CMAKE_DIR_GCC)/build.ninja

--- a/docs/EXCEPTION-SAFETY.md
+++ b/docs/EXCEPTION-SAFETY.md
+# Using exceptions safely in Pyston
+
+In addition to following general best practices for writing exception-safe C++, when writing Pyston there are a few special rules (because it has a custom unwinder):
+
+1. **Only throw `ExcInfo` values.** All Pyston exceptions are of type `ExcInfo`, which represents a Python exception. In fact, usually you should never `throw`; instead, call `raiseRaw`, `raiseExc`, `raise3`, or similar.
+
+2. **Always catch by value.** That is, always write:
+
+   ```c++
+   try { ... } catch (ExcInfo e) { ... } // Do this!
+   ```
+
+   And **never** write:
+
+   ```c++
+   try { ... } catch (ExcInfo& e) { ... } // DO NOT DO THIS!
+   ```
+
+   The reason for this has to do with the way exceptions are stored in thread-local storage in Pyston; see `docs/UNWINDING.md` for the gory details.
+
+3. **Never rethrow with bare `throw;`.** Instead, write `throw e;`, where `e` is the exception you caught previously.
+
+4. **Never invoke the GC from a destructor.** The GC is not currently aware of the place the exception-currently-being-unwound is stored. Invoking the GC from a destructor might collect the exception, producing a use-after-free bug!
+
+5. **Never throw an exception inside a destructor.** This is a general rule in C++ anyways, but worth reiterating here. In fact, don't even invoke code that *throws an exception but handles it*! This, again, has to do with the way exceptions are stored.
+
+6. **Don't throw exceptions inside signal handlers.** It should be okay to throw an exception that is *always* caught inside the handler itself, but I haven't tested this. If an exception does escape the handler, in theory it should just unwind through the signal frame, and libunwind will take care of resetting the signal mask. However, as this codepath hasn't been tested, it's best avoided.
+
+Most of these restrictions could be eliminated in principle. See `docs/UNWINDING.md` for the gory details.
--- a/docs/UNWINDING.md
+++ b/docs/UNWINDING.md
+# The Pyston Unwinder
+
+Pyston uses a custom exception unwinder, replacing the general-purpose C++ unwinder provided by `libstdc++` and `libgcc`. We do this for two reasons:
+
+1. **Efficiency**. The default clang/gcc C++ unwinder is slow, because it needs to support features we don't need (such as two-phase unwinding and multiple exception types) and because it isn't optimized for speed (C++ assumes exceptions are uncommon).
+
+2. **Customizability**. For example, Python handles backtraces differently than C++ does; with a custom unwinder, we can support Python-style backtraces more easily.
+
+The custom unwinder is in `src/runtime/cxx_unwind.cpp`.
+
+### Useful references on C++ exception handling
+
+- <https://monoinfinito.wordpress.com/series/exception-handling-in-c/>: Good overview of C++ exceptions.
+- <http://www.airs.com/blog/archives/460>: Covers dirty details of `.eh_frame`.
+- <http://www.airs.com/blog/archives/464>: Covers dirty details of the personality function and the LSDA.
+
+# How normal C++ unwinding works
+
+The big picture is that when an exception is thrown, we walk the stack *twice*:
+
+1. In the first phase, we look for a `catch`-block whose type matches the thrown exception. If we don't find one, we terminate the process.
+
+2. In the second phase, we unwind up to the `catch`-block we found; along the way we run any intervening `finally` blocks or RAII destructors.
+
+The purpose of the two-phase search is to make sure that *exceptions that won't be caught terminate the process immediately with a full stack-trace*. In Pyston we don't care about this --- stack traces work differently for us anyway.
+
+## How normal C++ unwinding works, in detail
+
+### Throwing
+
+C++ `throw` statements are translated into a pair of method calls:
+
+1. A call to `void *__cxxabiv1::__cxa_allocate_exception(size_t)` allocates space for an exception of the given size.
+
+2. A call to `void __cxxabiv1::__cxa_throw(void *exc_obj, std::type_info *type_info, void (*dtor)(void*))` invokes the stack unwinder. `exc_obj` is the exception to be thrown; `type_info` is the RTTI for the exception's class, and `dtor` is a callback that (I think) is called to destroy the exception object.
+
+These functions (and others in the `__cxxabiv1` namespace) are defined in `libstdc++`. `__cxa_throw` invokes the generic (non-C++-specific) unwinder by calling `_Unwind_RaiseException()`. This function (like the others prefixed with `_Unwind`) is defined in `libgcc`. The details of the libgcc unwinder's interface are less important, and I omit them here.
+
+### Unwinding and .eh_frame
+
+The libgcc unwinder walks the call frame stack, looking up debug information about each function it unwinds through. It finds the debug information by looking up the frame's return address (the instruction pointer that would be returned to) in a list of tables, one table for each loaded object (in the linker-and-loader sense of "object", i.e. executable file or shared library). For a given object, the debug info is in a section called `.eh_frame`. See [this blog post](http://www.airs.com/blog/archives/460) for more on the format of `.eh_frame`.
+
+In particular, the unwinder checks whether the function has an associated "personality function", and calls it if it does. If there's no personality function, unwinding continues as normal. C functions do not have personality functions. C++ functions have the personality function `__gxx_personality_v0`, or (if they don't involve exceptions or RAII at all) no personality function.
+
+The job of the personality function is to:
+
+1. Determine what action, if any, needs to happen when unwinding this exception through this frame.
+
+2. If we are in Phase 1, or if there is no action to be taken, report this information to the caller.
+
+3. If we are in Phase 2, actually take the relevant action: jump into the relevant cleanup code, `finally`, or `catch` block. In this case, the personality function does not return.
+
+### The LSDA, landing pads and switch values: how the personality function works
+
+The personality function determines what to do by comparing the instruction pointer being unwound through against C++-specific unwinding information. This is contained in an area of `.eh_frame` called the LSDA (Language-Specific Data Area). See [this blog post](http://www.airs.com/blog/archives/464) for a detailed run-down.
+
+If the personality function finds a "special" action to perform when unwinding, it is associated with two values:
+
+- The *landing pad*, a code address, determined by the instruction pointer value.
+- The *switch value*, an `int64_t`. This is *zero* if we're running cleanup code (RAII destructors or a `finally` block); otherwise it is an index that indicates *which* `catch` block we've matched (since there may be several `catch` blocks covering the code region we're unwinding through).
+
+If we're in phase 2, the personality function then jumps to the landing pad, after (a) restoring execution state for this call frame and (b) storing the exception object pointer and the switch value in specific registers (`RAX` and `RDX` respectively). The code at the landing pad is emitted by the C++ compiler as part of the function being unwound through, and it dispatches on the switch value to determine what code to actually run.
+
+It dispatches to code in one of two flavors: *cleanup code* (`finally` blocks and RAII destructors), or *handler code* (`catch` blocks).
+
+#### Cleanup code (`finally`/RAII)
+
+Cleanup code does what you'd expect: calls the appropriate destructors and/or runs the code in the appropriate `finally` block. It may also call `__cxa_end_catch()`, if we are unwinding out of a catch block - think of `__cxa_begin_catch()` and `__cxa_end_catch()` as being like an RAII constructor/destructor pair; the latter is guaranteed to get called when leaving a catch block, whether normally or by exception.
+
+After this is done, it calls `_Unwind_Resume()` to resume unwinding, passing it the exception object pointer that it received in `RAX` when the personality function jumped to the landing pad.
+
+#### Handler code (`catch`)
+
+Handler code, first of all, may *also* call RAII destructors or other cleanup code if necessary. After that, it *may* call `__cxa_get_exception_ptr` with the exception object pointer. I'm not sure why it does this, but it expects `__cxa_get_exception_ptr` to also *return* a pointer to the exception object, so it's effectively a no-op. (I think in a normal C++ unwinder maybe there's an exception *header* as well, and some pointer arithmetic going on, so that the pointer passed in `RAX` to the landing pad and the exception object itself are different?)
+
+After this, it calls `__cxa_begin_catch()` with the exception object pointer. Again, `__cxa_begin_catch()` is expected to return the exception object pointer, so in Pyston this is basically a no-op. (Again, maybe there's some funky pointer arithmetic going on in regular C++ unwinding - I'm not sure.)
+
+Then, *if* the exception is caught by-value (`catch (ExcInfo e)`) rather than by-reference (`catch (ExcInfo& e)`) - and Pyston must *always* catch by value - it copies the exception object onto the stack.
+
+Then it runs the code inside the catch block, like you'd expect.
+
+Finally, it calls `__cxa_end_catch()` (which takes no arguments). In regular C++ this destroys the current exception if appropriate. (It grabs the exception out of some thread-specific data structure that I don't fully understand.)
+
+# How our unwinder is different
+
+We use `libunwind` to deal with a lot of the tedious gruntwork (restoring register state, etc.) of unwinding.
+
+First, we dispense with two-phase unwinding. It's slow and Python tracebacks work differently anyway. (Currently we grab tracebacks before we start unwinding; in the future, we ought to generate them incrementally *as* we unwind.)
+
+Second, we allocate exceptions using a thread-local variable, rather than `malloc()`. By ensuring that only one exception is ever active on a given thread at a given time, this lets us be more efficient. However, we have not measured the performance improvement here; it may be negligible.
+
+Third, when unwinding, we only check whether a function *has* a personality function. If it does, we assert that it is `__gxx_personality_v0`, but we *do not call it*. Instead, we run our own custom dispatch code. We do this because:
+
+1. One argument to the personality function is the current unwind context, in a `libgcc`-specific format. libunwind uses a different format, so we *can't* call it.
+
+2. It avoids an unnecessary indirect call.
+
+3. The personality function checks the exception's type against `catch`-block types. All Pyston exceptions have the same type, so this is unnecessary.
+
+## Functions we override
+- `std::terminate`
+- `__gxx_personality_v0`: stubbed out, should never be called
+- `_Unwind_Resume`
+- `__cxxabiv1::__cxa_allocate_exception`
+- `__cxxabiv1::__cxa_begin_catch`
+- `__cxxabiv1::__cxa_end_catch`
+- `__cxxabiv1::__cxa_throw`
+- `__cxxabiv1::__cxa_rethrow`: stubbed out, we never rethrow directly
+- `__cxxabiv1::__cxa_get_exception_ptr`
+
+# Future work
+
+## Incremental traceback generation
+
+Python tracebacks include only the area of the stack between where the exception was originally raised and where it gets caught. Currently we generate tracebacks (via `getTraceback`) using `unwindPythonStack()` in `src/codegen/unwinding.cpp`, which unwinds the whole stack at once.
+
+Instead we ought to generate them *as we unwind*. This should be a straightforward matter of taking the code in `unwindPythonStack` and integrating it into `unwind_loop` (in `src/runtime/cxx_unwind.cpp`), so that we keep a "current traceback" object that we update as we unwind the stack and discover Python frames.
+
+## Binary search in libunwind
+
+Libunwind, like libgcc, keeps a linked list of objects (executables, shared libraries) to search for debug info. Since it's a linked list, if it's very long we can't find debug info efficiently; a better way would be to keep an array sorted by the start address of the object (since objects are non-overlapping). This comes up in practice because LLVM JITs each function as a separate object.
+
+libunwind's linked list is updated in `_U_dyn_register` (in `libunwind/src/mi/dyn-register.c`) and scanned in `local_find_proc_info` (in `libunwind/src/mi/Gfind_dynamic_proc_info.c`) (and possibly elsewhere).
+
+## GC awareness
+
+Currently we store exceptions-being-unwound in a thread-local variable, `pyston::exception_ferry` (in `src/runtime/cxx_unwind.cpp`). This is invisible to the GC. This *should* be fine, since this variable is only relevant during unwinding, and unwinding *should not* trigger the GC. `catch`-block code might, but as long as we catch by-value (`catch (ExcInfo e)` rather than `catch (ExcInfo& e)`), the relevant pointers will be copied to our stack (thus GC-visible) before any catch-block code is run. The only other problem is if *destructors* can cause GC, since destructors *are* called during unwinding and there's nothing we can do about that. So don't do that!
+
+It wouldn't be too hard to make the GC aware of `pyston::exception_ferry`. We could either:
+- add code to the GC that regards `pyston::exception_ferry` as a source of roots, OR
+- store the exception ferry in `cur_thread_state` instead of its own variable, and update `ThreadStateInternal::accept`
+
+HOWEVER, there's a problem: if we do this, we need to *zero out* the exception ferry at the appropriate time (to avoid keeping an exception alive after it ought to be garbage), and this is harder than it seems. We can't zero it out in `__cxa_begin_catch`, because it's only *after* `__cxa_begin_catch` returns that the exception is copied to the stack. We can't zero it in `__cxa_end_catch`, because `__cxa_end_catch` is called *even if exiting a catch block due to an exception*, so we'd wipe an exception that we actually wanted to propagate!
+
+So this is tricky.
+
+## Decrementing IC counts when unwinding through ICs
+
+To do this, we need some way to tell when we're unwinding through an IC. Keeping a global map from instruction-ranges to IC information should suffice. Then we just check and update this map inside of `unwind_loop`. This might slow us down a bit, but it's probably negligible; worth measuring, though.
+
+Alternatively, there might be some way to use the existing cleanup-code support in the unwinder to do this. That would involve generating EH-frames on the fly, but we already do this! So probably we'd just need to generate more complicated EH frames.
--- a/from_cpython/Include/dictobject.h
+++ b/from_cpython/Include/dictobject.h
@@ -94,7 +94,8 @@ struct _dictobject {
 #endif
 typedef struct {
    PyObject_HEAD;
-    char _filler[48];
+    char _filler[48];           // gcc 4.8
+    // char _filler[56];           // gcc 4.9
 } PyDictObject;

 // Pyston change: these are no longer static objects:

--- a/from_cpython/Include/object.h
+++ b/from_cpython/Include/object.h
@@ -454,7 +454,9 @@ struct _typeobject {

    void* _hcls;
    void* _hcattrs;
-    char _dep_getattrs[56]; // FIXME: this is hardcoding the size of this particular implementation of std::unordered_map
+    // FIXME: this is hardcoding the size of this particular implementation of std::unordered_map
+    char _dep_getattrs[56];     // gcc 4.8
+    // char _dep_getattrs[64];     // gcc 4.9
    char _ics[32];
    void* _gcvisit_func;
    void* _dtor;

--- a/libunwind_patches/0001-Change-the-RBP-validation-heuristic-to-allow-size-0-.patch
+++ b/libunwind_patches/0001-Change-the-RBP-validation-heuristic-to-allow-size-0-.patch
-From e1d7c78d95e4b73a311f10149d0a54547d307d5d Mon Sep 17 00:00:00 2001
-From: Kevin Modzelewski <kmod@dropbox.com>
-Date: Tue, 22 Apr 2014 15:50:40 -0700
-Subject: [PATCH] Change the RBP-validation heuristic to allow size-0 call frames
-
---
- include/libunwind.h |    3 +++
- src/x86_64/Gstep.c  |    2 +-
- 2 files changed, 4 insertions(+), 1 deletions(-)
-
-diff --git a/include/libunwind.h b/include/libunwind.h
-index d11c823..d9a5f03 100644
--- a/include/libunwind.h
-+++ b/include/libunwind.h
-@@ -30,3 +30,6 @@
- # include "libunwind-x86_64.h"
- 
- #endif /* UNW_REMOTE_ONLY */
-+
-+#define LIBUNWIND_PYSTON_PATCH_VERSION 0x01
-+
-diff --git a/src/x86_64/Gstep.c b/src/x86_64/Gstep.c
-index 9fa0967..809d60b 100644
--- a/src/x86_64/Gstep.c
-+++ b/src/x86_64/Gstep.c
-@@ -173,7 +173,7 @@ unw_step (unw_cursor_t *cursor)
- 		 anything about new RBP (rbp1) since it may not be a frame
- 		 pointer in the frame above.  Just check we get the value. */
-               if (ret < 0
-		  || rbp <= c->dwarf.cfa
-+		  || rbp < c->dwarf.cfa
- 		  || (rbp - c->dwarf.cfa) > 0x4000)
- 	        {
-                   rip_loc = DWARF_NULL_LOC;
-- 
-1.7.4.1
-
--- a/libunwind_patches/0001-pyston-add-lots-of-comments.patch
+++ b/libunwind_patches/0001-pyston-add-lots-of-comments.patch
+From 851b35ec5f1e27273fcf271e94364ced31baa2b5 Mon Sep 17 00:00:00 2001
+From: Michael Arntzenius <daekharel@gmail.com>
+Date: Mon, 18 May 2015 17:47:38 -0700
+Subject: [PATCH 1/2] pyston: add lots of comments
+
+---
+ src/dwarf/Gparser.c     | 3 +++
+ src/mi/Gdyn-extract.c   | 3 ++-
+ src/mi/Gget_proc_name.c | 3 +++
+ src/x86_64/Gstep.c      | 2 +-
+ 4 files changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/src/dwarf/Gparser.c b/src/dwarf/Gparser.c
+index fefd809..a5dd943 100644
+--- a/src/dwarf/Gparser.c
+++ b/src/dwarf/Gparser.c
+@@ -429,6 +429,7 @@ fetch_proc_info (struct dwarf_cursor *c, unw_word_t ip, int need_unwind_info)
+   memset (&c->pi, 0, sizeof (c->pi));
+ 
+   /* check dynamic info first --- it overrides everything else */
+  /* TODO rntz: this might be slow when there are lots of jitted functions */
+   ret = unwi_find_dynamic_proc_info (c->as, ip, &c->pi, need_unwind_info,
+ 				     c->as_arg);
+   if (ret == -UNW_ENOINFO)
+@@ -543,6 +544,7 @@ get_rs_cache (unw_addr_space_t as, intrmask_t *saved_maskp)
+       lock_acquire (&cache->lock, *saved_maskp);
+     }
+ 
+  /* XXX rntz: this looks dangerous. why does it need to be atomic? */
+   if (atomic_read (&as->cache_generation) != atomic_read (&cache->generation))
+     {
+       flush_rs_cache (cache);
+@@ -921,6 +923,7 @@ dwarf_create_state_record (struct dwarf_cursor *c, dwarf_state_record_t *sr)
+ HIDDEN int
+ dwarf_make_proc_info (struct dwarf_cursor *c)
+ {
+    /* TODO rntz: why is this #if 0'd? does the cache not work? check upstream. */
+ #if 0
+   if (c->as->caching_policy == UNW_CACHE_NONE
+       || get_cached_proc_info (c) < 0)
+diff --git a/src/mi/Gdyn-extract.c b/src/mi/Gdyn-extract.c
+index 5258839..12ba31f 100644
+--- a/src/mi/Gdyn-extract.c
+++ b/src/mi/Gdyn-extract.c
+@@ -33,7 +33,8 @@ unwi_extract_dynamic_proc_info (unw_addr_space_t as, unw_word_t ip,
+   pi->start_ip = di->start_ip;
+   pi->end_ip = di->end_ip;
+   pi->gp = di->gp;
+-  pi->format = di->format;
+  pi->format = di->format;      /* XXX rntz: is this wrong? */
+  /* This is the point at which we can end up knowing we'll return a non-UNW_INFO_FORMAT_DYNAMIC unwind_info field. */
+   switch (di->format)
+     {
+     case UNW_INFO_FORMAT_DYNAMIC:
+diff --git a/src/mi/Gget_proc_name.c b/src/mi/Gget_proc_name.c
+index 7251c59..485346c 100644
+--- a/src/mi/Gget_proc_name.c
+++ b/src/mi/Gget_proc_name.c
+@@ -55,9 +55,12 @@ get_proc_name (unw_addr_space_t as, unw_word_t ip,
+ 
+   buf[0] = '\0';	/* always return a valid string, even if it's empty */
+ 
+  /* FIXME rntz: this ends up copying a dwarf_cie_info pi.unwind_info,
+   * and then reading it back as an unw_dyn_info_t! */
+   ret = unwi_find_dynamic_proc_info (as, ip, &pi, 1, arg);
+   if (ret == 0)
+     {
+      assert(pi.format == UNW_INFO_FORMAT_DYNAMIC); /* FIXME rntz: handle this being false. */
+       unw_dyn_info_t *di = pi.unwind_info;
+ 
+       if (offp)
+diff --git a/src/x86_64/Gstep.c b/src/x86_64/Gstep.c
+index 809d60b..e4312af 100644
+--- a/src/x86_64/Gstep.c
+++ b/src/x86_64/Gstep.c
+@@ -158,7 +158,7 @@ unw_step (unw_cursor_t *cursor)
+ 	    }
+ 	  else
+ 	    {
+-	      unw_word_t rbp1 = 0;
+	      unw_word_t rbp1 = 0; /* might want to put an assert here to check for guessing */
+ 	      rbp_loc = DWARF_LOC(rbp, 0);
+ 	      rsp_loc = DWARF_NULL_LOC;
+ 	      rip_loc = DWARF_LOC (rbp + 8, 0);
+-- 
+2.1.0
+
--- a/libunwind_patches/0002-pyston-stop-x86_64-setcontext-restoring-uninitialize.patch
+++ b/libunwind_patches/0002-pyston-stop-x86_64-setcontext-restoring-uninitialize.patch
+From 3faf9111fa09e26209eb01091a8ad61c28ae6197 Mon Sep 17 00:00:00 2001
+From: Michael Arntzenius <daekharel@gmail.com>
+Date: Tue, 19 May 2015 14:11:27 -0700
+Subject: [PATCH 2/2] pyston: stop x86_64 setcontext() restoring
+ (uninitialized) signal mask
+
+---
+ src/x86_64/setcontext.S | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/src/x86_64/setcontext.S b/src/x86_64/setcontext.S
+index 1af8b67..7bee005 100644
+--- a/src/x86_64/setcontext.S
+++ b/src/x86_64/setcontext.S
+@@ -47,6 +47,14 @@ _Ux86_64_setcontext:
+ #if defined __linux__
+ 	/* restore signal mask
+            sigprocmask(SIG_SETMASK, ucp->uc_sigmask, NULL, sizeof(sigset_t)) */
+
+	/* PYSTON CHANGE: for some reason, libunwind restores the signal mask
+	 * in _Ux86_64_setcontext() even though _Ux86_64_getcontext doesn't
+	 * initialize it! This sets our signal mask to random stack garbage,
+	 * so I've commented it out. - rntz
+	 */
+
+	/*
+ 	push %rdi
+ 	mov $__NR_rt_sigprocmask, %rax
+ 	lea UC_SIGMASK(%rdi), %rsi
+@@ -55,21 +63,29 @@ _Ux86_64_setcontext:
+ 	mov $SIGSET_BYTE_SIZE, %r10
+ 	syscall
+ 	pop %rdi
+	*/
+ 
+         /* restore fp state */
+ 	mov    UC_MCONTEXT_FPREGS_PTR(%rdi),%r8
+ 	fldenv (%r8)
+ 	ldmxcsr FPREGS_OFFSET_MXCSR(%r8)
+ #elif defined __FreeBSD__
+	/* PYSTON CHANGE */
+#error Pyston doesn't support FreeBSD yet.
+ 	/* restore signal mask */
+	/* PYSTON CHANGE: Commented out for same reason as the linux code
+	 * above, but I haven't tested this one. Use at your own risk. - rntz
+	 */
+	/*
+ 	pushq	%rdi
+ 	xorl	%edx,%edx
+ 	leaq	UC_SIGMASK(%rdi),%rsi
+-	movl	$3,%edi/* SIG_SETMASK */
+	movl	$3,%edi/\* SIG_SETMASK *\/
+ 	movl	$SYS_sigprocmask,%eax
+ 	movq	%rcx,%r10
+ 	syscall
+ 	popq	%rdi
+	*/
+ 
+ 	/* restore fp state */
+ 	cmpq $UC_MCONTEXT_FPOWNED_FPU,UC_MCONTEXT_OWNEDFP(%rdi)
+-- 
+2.1.0
+
--- a/microbenchmarks/exceptions_2_ubench.py
+++ b/microbenchmarks/exceptions_2_ubench.py
+NUM_ITERS = 100 * 1000
+WRAPPER_DEPTH = 10
+RECURSE_DEPTH = 0
+TRACEBACK_DEPTH = 0
+
+counter = 0
+
+def gtor():
+    yield 1
+    raise Exception('bad wrong')
+    yield 2
+
+def wrapper(n=WRAPPER_DEPTH):
+    global counter
+    if n:
+        try:
+            wrapper(n-1)
+        finally:
+            counter += 1
+    else:
+        for x in gtor():
+            pass
+
+def recurser(n=RECURSE_DEPTH):
+    if n:
+        return recurser(n-1)
+    else:
+        return wrapper()
+
+def f(niters, traceback_depth=TRACEBACK_DEPTH):
+    global counter
+    if traceback_depth:
+        f(niters, traceback_depth - 1)
+    else:
+        for i in xrange(niters):
+            try:
+                recurser()
+            except Exception:
+                counter = 0
+
+f(NUM_ITERS)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -68,6 +68,7 @@ add_library(PYSTON_OBJECTS OBJECT ${OPTIONAL_SRCS}
 		core/stats.cpp
 		core/threading.cpp
 		core/util.cpp
+		deadlock_debug_helper.cpp
 		gc/collector.cpp
 		gc/gc_alloc.cpp
 		gc/heap.cpp
@@ -83,6 +84,7 @@ add_library(PYSTON_OBJECTS OBJECT ${OPTIONAL_SRCS}
 		runtime/code.cpp
 		runtime/complex.cpp
 		runtime/ctxswitching.S
+		runtime/cxx_unwind.cpp
 		runtime/descr.cpp
 		runtime/dict.cpp
 		runtime/file.cpp

--- a/src/asm_writing/icinfo.h
+++ b/src/asm_writing/icinfo.h
@@ -90,6 +90,7 @@ private:
    // This is probably a bunch worse than LRU, but it's also
    // probably a bunch better than the "always evict slot #0" policy
    // that it's replacing.
+    // TODO: experiment with different IC eviction strategies.
    int next_slot_to_try;

    const StackInfo stack_info;

--- a/src/codegen/ast_interpreter.cpp
+++ b/src/codegen/ast_interpreter.cpp
@@ -76,7 +76,12 @@ public:

    void initArguments(int nargs, BoxedClosure* closure, BoxedGenerator* generator, Box* arg1, Box* arg2, Box* arg3,
                       Box** args);
-    static Value execute(ASTInterpreter& interpreter, CFGBlock* start_block = NULL, AST_stmt* start_at = NULL);
+
+    // This must not be inlined, because we rely on being able to detect when we're inside of it (by checking whether
+    // %rip is inside its instruction range) during a stack-trace in order to produce tracebacks inside interpreted
+    // code. (The attribute must be spelled `noinline`: an unrecognized spelling is ignored, with at most a warning.)
+    __attribute__((__noinline__)) static Value
+        execute(ASTInterpreter& interpreter, CFGBlock* start_block = NULL, AST_stmt* start_at = NULL);

 private:
    Box* createFunction(AST* node, AST_arguments* args, const std::vector<AST_stmt*>& body);
@@ -274,6 +279,9 @@ void ASTInterpreter::initArguments(int nargs, BoxedClosure* _closure, BoxedGener
    }
 }

+// Map from stack frame pointers for frames corresponding to ASTInterpreter::execute() to the ASTInterpreter handling
+// them. Used to look up information about that frame. This is used for getting tracebacks, for CPython introspection
+// (sys._getframe & co), and for GC scanning.
 static std::unordered_map<void*, ASTInterpreter*> s_interpreterMap;
 static_assert(THREADING_USE_GIL, "have to make the interpreter map thread safe!");


--- a/src/codegen/irgen/irgenerator.cpp
+++ b/src/codegen/irgen/irgenerator.cpp
@@ -1855,6 +1855,7 @@ private:
        static const std::string newline_str("\n");
        static const std::string space_str(" ");

+        // TODO: why are we inline-generating all this code instead of just emitting a call to some runtime function?
        int nvals = node->values.size();
        for (int i = 0; i < nvals; i++) {
            CompilerVariable* var = evalExpr(node->values[i], unw_info);

--- a/src/codegen/unwinding.cpp
+++ b/src/codegen/unwinding.cpp
@@ -59,14 +59,18 @@ namespace pyston {

 // Parse an .eh_frame section, and construct a "binary search table" such as you would find in a .eh_frame_hdr section.
 // Currently only supports .eh_frame sections with exactly one fde.
-void parseEhFrame(uint64_t start_addr, uint64_t size, uint64_t* out_data, uint64_t* out_len) {
+// See http://www.airs.com/blog/archives/460 for some useful info.
+void parseEhFrame(uint64_t start_addr, uint64_t size, uint64_t func_addr, uint64_t* out_data, uint64_t* out_len) {
+    // NB. according to sully@msully.net, this is not legal C++ b/c type-punning through unions isn't allowed.
+    // But I can't find a compiler flag that warns on it, and it seems to work.
    union {
        uint8_t* u8;
        uint32_t* u32;
    };
    u32 = (uint32_t*)start_addr;

-    int cie_length = *u32;
+    int32_t cie_length = *u32;
+    assert(cie_length != 0xffffffff); // 0xffffffff would indicate a 64-bit DWARF format
    u32++;

    assert(*u32 == 0); // CIE ID
@@ -80,13 +84,37 @@ void parseEhFrame(uint64_t start_addr, uint64_t size, uint64_t* out_data, uint64

    int nentries = 1;
    uw_table_entry* table_data = new uw_table_entry[nentries];
-    table_data->start_ip_offset = 0;
+    table_data->start_ip_offset = func_addr - start_addr;
    table_data->fde_offset = 4 + cie_length;

    *out_data = (uintptr_t)table_data;
    *out_len = nentries;
 }

+// Registers unwind information for a range of code with libunwind via _U_dyn_register().
+//   code_addr, code_size:         the registered range is [code_addr, code_addr + code_size).
+//   eh_frame_addr, eh_frame_size: location of the .eh_frame data for that range; it is handed to parseEhFrame()
+//                                 to build the lookup table and is also used as the table's segbase.
+// The unw_dyn_info_t allocated here is handed to libunwind and is not freed by this function.
+void registerDynamicEhFrame(uint64_t code_addr, size_t code_size, uint64_t eh_frame_addr, size_t eh_frame_size) {
+    unw_dyn_info_t* dyn_info = new unw_dyn_info_t();
+    dyn_info->start_ip = code_addr;
+    dyn_info->end_ip = code_addr + code_size;
+    // TODO: It's not clear why we use UNW_INFO_FORMAT_REMOTE_TABLE instead of UNW_INFO_FORMAT_TABLE. kmod reports that
+    // he tried FORMAT_TABLE and it didn't work, but it wasn't clear why. However, using FORMAT_REMOTE_TABLE forces
+    // indirection through an access_mem() callback, and indeed, a function named access_mem() shows up in our `perf`
+    // results! So it's possible there's a performance win lurking here.
+    dyn_info->format = UNW_INFO_FORMAT_REMOTE_TABLE;
+
+    dyn_info->u.rti.name_ptr = 0;
+    dyn_info->u.rti.segbase = eh_frame_addr;
+    // parseEhFrame() fills in the table pointer and entry count directly through the two out-parameters.
+    parseEhFrame(eh_frame_addr, eh_frame_size, code_addr, &dyn_info->u.rti.table_data, &dyn_info->u.rti.table_len);
+
+    if (VERBOSITY() >= 2)
+        printf("dyn_info = %p, table_data = %p\n", dyn_info, (void*)dyn_info->u.rti.table_data);
+    _U_dyn_register(dyn_info);
+
+    // TODO: it looks like libunwind does a linear search over anything dynamically registered,
+    // as opposed to the binary search it can do within a dyn_info.
+    // If we're registering a lot of dyn_info's, it might make sense to coalesce them into a single
+    // dyn_info that contains a binary search table.
+}
+
 class CFRegistry {
 private:
    std::vector<CompiledFunction*> cfs;
@@ -156,55 +184,59 @@ public:

        assert(g.cur_cf);

-        llvm_error_code ec;
+        uint64_t func_addr = 0; // remains 0 until we find a function
+
+        // Search through the symbols to find the function that got JIT'ed.
+        // (We only JIT one function at a time.)
        for (const auto& sym : Obj.symbols()) {
            llvm::object::SymbolRef::Type SymType;
-            if (sym.getType(SymType))
+            if (sym.getType(SymType) || SymType != llvm::object::SymbolRef::ST_Function)
+                continue;
+
+            llvm::StringRef Name;
+            uint64_t Size;
+            if (sym.getName(Name) || sym.getSize(Size))
                continue;
-            if (SymType == llvm::object::SymbolRef::ST_Function) {
-                llvm::StringRef Name;
-                uint64_t Addr;
-                uint64_t Size;
-                if (sym.getName(Name))
-                    continue;
-                Addr = L.getSymbolLoadAddress(Name);
-                assert(Addr);
-                if (sym.getSize(Size))
-                    continue;
+
+            // Found a function!
+            assert(!func_addr);
+            func_addr = L.getSymbolLoadAddress(Name);
+            assert(func_addr);

 // TODO this should be the Python name, not the C name:
 #if LLVMREV < 208921
-                llvm::DILineInfoTable lines = Context->getLineInfoForAddressRange(
-                    Addr, Size, llvm::DILineInfoSpecifier::FunctionName | llvm::DILineInfoSpecifier::FileLineInfo
-                                    | llvm::DILineInfoSpecifier::AbsoluteFilePath);
+            llvm::DILineInfoTable lines = Context->getLineInfoForAddressRange(
+                func_addr, Size, llvm::DILineInfoSpecifier::FunctionName | llvm::DILineInfoSpecifier::FileLineInfo
+                                     | llvm::DILineInfoSpecifier::AbsoluteFilePath);
 #else
-                llvm::DILineInfoTable lines = Context->getLineInfoForAddressRange(
-                    Addr, Size, llvm::DILineInfoSpecifier(llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
-                                                          llvm::DILineInfoSpecifier::FunctionNameKind::LinkageName));
+            llvm::DILineInfoTable lines = Context->getLineInfoForAddressRange(
+                func_addr, Size,
+                llvm::DILineInfoSpecifier(llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
+                                          llvm::DILineInfoSpecifier::FunctionNameKind::LinkageName));
 #endif
-                if (VERBOSITY() >= 3) {
-                    for (int i = 0; i < lines.size(); i++) {
-                        printf("%s:%d, %s: %lx\n", lines[i].second.FileName.c_str(), lines[i].second.Line,
-                               lines[i].second.FunctionName.c_str(), lines[i].first);
-                    }
+            if (VERBOSITY() >= 3) {
+                for (int i = 0; i < lines.size(); i++) {
+                    printf("%s:%d, %s: %lx\n", lines[i].second.FileName.c_str(), lines[i].second.Line,
+                           lines[i].second.FunctionName.c_str(), lines[i].first);
                }
-
-                assert(g.cur_cf->code_start == 0);
-                g.cur_cf->code_start = Addr;
-                g.cur_cf->code_size = Size;
-                cf_registry.registerCF(g.cur_cf);
            }
+
+            assert(g.cur_cf->code_start == 0);
+            g.cur_cf->code_start = func_addr;
+            g.cur_cf->code_size = Size;
+            cf_registry.registerCF(g.cur_cf);
        }

-        // Currently-unused libunwind support:
-        llvm_error_code code;
+        assert(func_addr);
+
+        // Libunwind support:
        bool found_text = false, found_eh_frame = false;
        uint64_t text_addr = -1, text_size = -1;
        uint64_t eh_frame_addr = -1, eh_frame_size = -1;

        for (const auto& sec : Obj.sections()) {
            llvm::StringRef name;
-            code = sec.getName(name);
+            llvm_error_code code = sec.getName(name);
            assert(!code);

            uint64_t addr, size;
@@ -229,24 +261,9 @@ public:

        assert(found_text);
        assert(found_eh_frame);
+        assert(text_addr == func_addr);

-        unw_dyn_info_t* dyn_info = new unw_dyn_info_t();
-        dyn_info->start_ip = text_addr;
-        dyn_info->end_ip = text_addr + text_size;
-        dyn_info->format = UNW_INFO_FORMAT_REMOTE_TABLE;
-
-        dyn_info->u.rti.name_ptr = 0;
-        dyn_info->u.rti.segbase = eh_frame_addr;
-        parseEhFrame(eh_frame_addr, eh_frame_size, &dyn_info->u.rti.table_data, &dyn_info->u.rti.table_len);
-
-        if (VERBOSITY() >= 2)
-            printf("dyn_info = %p, table_data = %p\n", dyn_info, (void*)dyn_info->u.rti.table_data);
-        _U_dyn_register(dyn_info);
-
-        // TODO: it looks like libunwind does a linear search over anything dynamically registered,
-        // as opposed to the binary search it can do within a dyn_info.
-        // If we're registering a lot of dyn_info's, it might make sense to coalesce them into a single
-        // dyn_info that contains a binary search table.
+        registerDynamicEhFrame(text_addr, text_size, eh_frame_addr, eh_frame_size);
    }
 };

@@ -513,6 +530,36 @@ static const LineInfo* lineInfoForFrame(PythonFrameIteratorImpl& frame_it) {
    return new LineInfo(current_stmt->lineno, current_stmt->col_offset, source->fn, source->getName());
 }

+// To produce a traceback, we:
+//
+// 1. Use libunwind to produce a cursor into our stack.
+//
+// 2. Grab the next frame in the stack and check what function it is from. There are four options:
+//
+//    (a) A JIT-compiled Python function.
+//    (b) ASTInterpreter::execute() in codegen/ast_interpreter.cpp.
+//    (c) generatorEntry() in runtime/generator.cpp.
+//    (d) Something else.
+//
+//    By cases:
+//
+//    (2a, 2b) If the previous frame we visited was an OSR frame (which we know from its CompiledFunction*), then we
+//    skip this frame (it's the frame we replaced on-stack) and keep unwinding. (FIXME: Why are we guaranteed that we
+//    on-stack-replaced at most one frame?) Otherwise, we found a frame for our traceback! Proceed to step 3.
+//
+//    (2c) Continue unwinding in the stack of whatever called the generator. This involves some hairy munging of
+//    undocumented fields in libunwind structs to swap the context.
+//
+//    (2d) Ignore it and keep unwinding. It's some C or C++ function that we don't want in our traceback.
+//
+// 3. We've found a frame for our traceback, along with a CompiledFunction* and some other information about it.
+//
+//    We grab the current statement it is in (as an AST_stmt*) and use it and the CompiledFunction*'s source info to
+//    produce the line information for the traceback. For JIT-compiled functions, getting the statement involves the
+//    CF's location_map.
+//
+// 4. Unless we've hit the end of the stack, go to 2 and keep unwinding.
+//
 static StatCounter us_gettraceback("us_gettraceback");
 BoxedTraceback* getTraceback() {
    STAT_TIMER(t0, "us_timer_gettraceback");

--- a/src/codegen/unwinding.h
+++ b/src/codegen/unwinding.h
@@ -27,9 +27,12 @@ class BoxedModule;
 class BoxedTraceback;
 struct FrameInfo;

+void registerDynamicEhFrame(uint64_t code_addr, size_t code_size, uint64_t eh_frame_addr, size_t eh_frame_size);
+
 BoxedModule* getCurrentModule();
 Box* getGlobals();     // returns either the module or a globals dict
 Box* getGlobalsDict(); // always returns a dict-like object
+CompiledFunction* getCFForAddress(uint64_t addr);

 BoxedTraceback* getTraceback();


--- a/src/core/common.h
+++ b/src/core/common.h
@@ -41,17 +41,6 @@

 #define ARRAY_LEN(arr) (sizeof(arr) / sizeof((arr)[0]))

-// GCC and clang handle always_inline very differently;
-// we mostly only care about it for the stdlib, so just remove the attributes
-// if we're not in clang
-#ifdef __clang__
-#define ALWAYSINLINE __attribute__((always_inline))
-#define NOINLINE __attribute__((noinline))
-#else
-#define ALWAYSINLINE
-#define NOINLINE
-#endif
-
 #if LLVMREV < 210783
 #define llvm_error_code llvm::error_code
 #else

--- a/src/core/options.h
+++ b/src/core/options.h
@@ -20,7 +20,7 @@ namespace pyston {
 extern "C" {

 extern int GLOBAL_VERBOSITY;
-#define VERBOSITY(x) GLOBAL_VERBOSITY
+#define VERBOSITY(x) pyston::GLOBAL_VERBOSITY
 extern int PYSTON_VERSION_MAJOR, PYSTON_VERSION_MINOR;
 // Version number we're targeting:
 extern int PYTHON_VERSION_MAJOR, PYTHON_VERSION_MINOR, PYTHON_VERSION_MICRO, PYTHON_VERSION_HEX;

--- a/src/core/thread_utils.h
+++ b/src/core/thread_utils.h
@@ -52,11 +52,20 @@ public:

 class PthreadFastMutex {
 private:
+    // NB. I tried using error-checking mutexes (PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP) here in debug-mode but got
+    // some funky errors. I think we might be deliberately locking/unlocking mutexes on different threads in some
+    // circumstances. - rntz
    pthread_mutex_t mutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP;

 public:
-    void lock() { pthread_mutex_lock(&mutex); }
-    void unlock() { pthread_mutex_unlock(&mutex); }
+    void lock() {
+        int err = pthread_mutex_lock(&mutex);
+        ASSERT(!err, "pthread_mutex_lock failed, error code %d", err);
+    }
+    void unlock() {
+        int err = pthread_mutex_unlock(&mutex);
+        ASSERT(!err, "pthread_mutex_unlock failed, error code %d", err);
+    }

    PthreadFastMutex* asRead() { return this; }
    PthreadFastMutex* asWrite() { return this; }
@@ -64,11 +73,18 @@ public:

 class PthreadMutex {
 private:
+    // Ditto comment in PthreadFastMutex re error-checking mutexes. - rntz
    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

 public:
-    void lock() { pthread_mutex_lock(&mutex); }
-    void unlock() { pthread_mutex_unlock(&mutex); }
+    void lock() {
+        int err = pthread_mutex_lock(&mutex);
+        ASSERT(!err, "pthread_mutex_lock failed, error code %d", err);
+    }
+    void unlock() {
+        int err = pthread_mutex_unlock(&mutex);
+        ASSERT(!err, "pthread_mutex_unlock failed, error code %d", err);
+    }

    PthreadMutex* asRead() { return this; }
    PthreadMutex* asWrite() { return this; }

--- a/src/core/threading.cpp
+++ b/src/core/threading.cpp
@@ -489,6 +489,13 @@ extern "C" void PyEval_ReInitThreads() noexcept {
        }
    }

+    // We need to make sure the threading lock is released, so we unconditionally unlock it. After a fork, we are the
+    // only thread, so this won't race; and since it's a "fast" mutex (see `man pthread_mutex_lock`), this works even
+    // if it isn't locked. If we needed to avoid unlocking a non-locked mutex, though, we could trylock it first:
+    //
+    //     int err = pthread_mutex_trylock(&threading_lock.mutex);
+    //     ASSERT(!err || err == EBUSY, "pthread_mutex_trylock failed, but not with EBUSY");
+    //
    threading_lock.unlock();

    num_starting_threads = 0;

--- a/src/core/types.h
+++ b/src/core/types.h
@@ -689,6 +689,8 @@ struct FrameInfo {
    // In Pyston, exc is the frame-local value of sys.exc_info.
    // - This makes frame entering+leaving faster at the expense of slower exceptions.
    //
+    // TODO: do we want exceptions to be slower? benchmark this!
+    //
    // exc.type is initialized to NULL at function entry, and exc.value and exc.tb are left
    // uninitialized.  When one wants to access any of the values, you need to check if exc.type
    // is NULL, and if so crawl up the stack looking for the first frame with a non-null exc.type

--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@@ -41,8 +41,13 @@ uint64_t getCPUTicks() {
    return rdtsc();
 }

+#if !DISABLE_TIMERS
+
 int Timer::level = 0;

+Timer::Timer(long min_usec) : min_usec(min_usec), ended(true) {
+}
+
 Timer::Timer(const char* desc, long min_usec) : min_usec(min_usec), ended(true) {
    restart(desc);
 }
@@ -101,6 +106,8 @@ Timer::~Timer() {
    }
 }

+#endif // !DISABLE_TIMERS
+
 bool startswith(const std::string& s, const std::string& pattern) {
    if (pattern.size() > s.size())
        return false;

--- a/src/core/util.h
+++ b/src/core/util.h
@@ -26,6 +26,9 @@ namespace pyston {

 uint64_t getCPUTicks();

+#define DISABLE_TIMERS 0
+
+#if !DISABLE_TIMERS
 class Timer {
 private:
    static int level;
@@ -36,7 +39,9 @@ private:
    std::function<void(uint64_t)> exit_callback;

 public:
+    // Timers with non-NULL desc will print times longer than min_usec for debugging when VERBOSITY("time") >= 2
    Timer(const char* desc = NULL, long min_usec = -1);
+    Timer(long min_usec); // doesn't start the timer
    ~Timer();

    void setExitCallback(std::function<void(uint64_t)> _exit_callback) { exit_callback = _exit_callback; }
@@ -56,6 +61,23 @@ public:
    uint64_t getStartTime() const { return start_time; }
 };

+#else // DISABLE_TIMERS
+// No-op stand-in for Timer, compiled when DISABLE_TIMERS is nonzero. It offers the same member functions as the
+// real class so that call sites build unchanged; every operation does nothing and every query reports 0.
+class Timer {
+public:
+    Timer(const char* desc = NULL, long min_usec = -1) {}
+    Timer(long min_usec) {}
+
+    void setExitCallback(std::function<void(uint64_t)> _exit_callback) {}
+
+    void restart(const char* newdesc, long new_min_usec) {}
+    void restart(const char* newdesc = NULL) {}
+
+    long end() { return 0; }
+    long split(const char* newdesc = NULL) { return 0; }
+
+    // Mirrors the accessor of the real Timer so that callers still compile; there is no start time to report.
+    uint64_t getStartTime() const { return 0; }
+};
+
+#endif // #else DISABLE_TIMERS
+
 bool startswith(const std::string& s, const std::string& pattern);
 bool endswith(const std::string& s, const std::string& pattern);


--- a/src/deadlock_debug_helper.cpp
+++ b/src/deadlock_debug_helper.cpp
+// Copyright (c) 2014-2015 Dropbox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is a hack for debugging deadlocks. It makes pthread_mutex_lock() complain whenever it takes longer than
+// a given time (TIMEOUT_S seconds) to grab a lock. Perhaps it will be useful in the future.
+
+#if 0 // set to 1 to enable
+
+#include <errno.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "core/common.h"
+
+#define TIMEOUT_S 2
+
+// Replacement for pthread_mutex_lock() that reports (on stdout) every TIMEOUT_S seconds for which the calling
+// thread has been stuck waiting for `lock`, then keeps waiting. Returns 0 once the mutex has been acquired; any
+// error other than a timeout trips a RELEASE_ASSERT.
+extern "C" int pthread_mutex_lock(pthread_mutex_t* lock) {
+    pid_t tid = syscall(SYS_gettid);
+    RELEASE_ASSERT(tid > 1, "negative or invalid TID");
+
+    int err;
+    for (;;) {
+        // pthread_mutex_timedlock() expects an ABSOLUTE CLOCK_REALTIME deadline, not a relative duration. Passing
+        // {TIMEOUT_S, 0} would name an instant in 1970, so every call would time out immediately and this loop
+        // would spin at full speed instead of sleeping. Recompute the deadline before each attempt.
+        struct timespec deadline;
+        int rc = clock_gettime(CLOCK_REALTIME, &deadline);
+        RELEASE_ASSERT(rc == 0, "could not get time, errno %d", errno);
+        deadline.tv_sec += TIMEOUT_S;
+
+        err = pthread_mutex_timedlock(lock, &deadline);
+        if (err != ETIMEDOUT)
+            break;
+
+        // We blocked for a full TIMEOUT_S without getting the lock: complain and try again.
+        printf("%d: mutex %p TIMED OUT\n", tid, (void*)lock);
+    }
+    RELEASE_ASSERT(!err, "could not lock mutex, error %d", err);
+    return err;
+}
+
+#endif
--- a/src/runtime/builtin_modules/builtins.cpp
+++ b/src/runtime/builtin_modules/builtins.cpp
@@ -213,7 +213,7 @@ extern "C" Box* next(Box* iterator, Box* _default) {
    } catch (ExcInfo e) {
        if (_default && e.matches(StopIteration))
            return _default;
-        throw;
+        throw e;
    }
 }

@@ -877,6 +877,7 @@ Box* print(BoxedTuple* args, BoxedDict* kwargs) {
    Box* space_box = boxStrConstant(" ");

    // TODO softspace handling?
+    // TODO: duplicates code with ASTInterpreter::visit_print()
    bool first = true;
    for (auto e : *args) {
        BoxedString* s = str(e);

--- a/src/runtime/builtin_modules/thread.cpp
+++ b/src/runtime/builtin_modules/thread.cpp
@@ -88,6 +88,9 @@ Box* startNewThread(Box* target, Box* args, Box* kw) {
 * codes in the return value of the sem_ calls (like the pthread_ functions).
 * Correct implementations return -1 and put the code in errno. This supports
 * either.
+ *
+ * NOTE (2015-05-14): According to `man pthread_mutex_lock` on my system (Ubuntu
+ * 14.10), returning the error code is expected behavior. - rntz
 */
 static int fix_status(int status) {
    return (status == -1) ? errno : status;
@@ -134,6 +137,7 @@ public:

        success = (status == 0) ? 1 : 0;

+        RELEASE_ASSERT(status == 0 || !waitflag, "could not lock mutex! error %d", status);
        return boxBool(status == 0);
    }


--- a/src/runtime/cxx_unwind.cpp
+++ b/src/runtime/cxx_unwind.cpp
+// Copyright (c) 2014-2015 Dropbox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdlib>
+#include <dlfcn.h> // dladdr
+#include <stddef.h>
+#include <stdint.h>
+#include <unwind.h>
+
+#include "llvm/Support/LEB128.h" // for {U,S}LEB128 decoding
+
+#include "codegen/ast_interpreter.h" // interpreter_instr_addr
+#include "codegen/unwinding.h"       // getCFForAddress
+#include "core/stats.h"              // StatCounter
+#include "core/types.h"              // for ExcInfo
+#include "core/util.h"               // Timer
+#include "runtime/generator.h"       // generatorEntry
+
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+#define PYSTON_CUSTOM_UNWINDER 1 // set to 0 to use C++ unwinder
+
+#define NORETURN __attribute__((__noreturn__))
+
+// canary used in ExcData in debug mode to catch exception-value corruption.
+#define CANARY_VALUE 0xdeadbeef
+
+// An action of 0 in the LSDA action table indicates cleanup.
+#define CLEANUP_ACTION 0
+
+// Dwarf encoding modes.
+#define DW_EH_PE_absptr 0x00
+#define DW_EH_PE_omit 0xff
+
+#define DW_EH_PE_uleb128 0x01
+#define DW_EH_PE_udata2 0x02
+#define DW_EH_PE_udata4 0x03
+#define DW_EH_PE_udata8 0x04
+#define DW_EH_PE_sleb128 0x09
+#define DW_EH_PE_sdata2 0x0A
+#define DW_EH_PE_sdata4 0x0B
+#define DW_EH_PE_sdata8 0x0C
+#define DW_EH_PE_signed 0x08
+
+#define DW_EH_PE_pcrel 0x10
+#define DW_EH_PE_textrel 0x20
+#define DW_EH_PE_datarel 0x30
+#define DW_EH_PE_funcrel 0x40
+#define DW_EH_PE_aligned 0x50
+
+#define DW_EH_PE_indirect 0x80
+// end dwarf encoding modes
+
+extern "C" void __gxx_personality_v0(); // wrong type signature, but that's ok, it's extern "C"
+
+// check(EXPR) is like assert((EXPR) == 0), except that EXPR is always evaluated: a plain assert() would drop the
+// whole expression, side effects included, when NDEBUG is defined. Only the comparison against 0 is compiled out
+// in such builds.
+template <typename T> static inline void check(T x) {
+    assert(x == 0);
+}
+
+namespace pyston {
+
+struct ExcData;
+extern thread_local ExcData exception_ferry;
+
+// Wraps an ExcInfo together with, in builds without NDEBUG, a canary word used to detect corruption of the value.
+struct ExcData {
+    ExcInfo exc;
+#ifndef NDEBUG
+    // Initialized to CANARY_VALUE; check() asserts that it still holds that value.
+    unsigned canary = CANARY_VALUE;
+#endif
+
+    ExcData() : exc(nullptr, nullptr, nullptr) {}
+    ExcData(ExcInfo e) : exc(e) {}
+    ExcData(Box* type, Box* value, Box* traceback) : exc(type, value, traceback) {}
+
+    // Sanity checks made entirely of assert()s, so this does nothing when NDEBUG is defined. Verifies the canary,
+    // that all three exception fields are non-null valid GC objects, and that this object is the thread-local
+    // exception_ferry.
+    void check() const {
+        // NOTE(review): `this` cannot be null in well-defined C++, so this assert may be flagged as tautological.
+        assert(this);
+        assert(canary == CANARY_VALUE);
+        assert(exc.type && exc.value && exc.traceback);
+        assert(gc::isValidGCObject(exc.type) && gc::isValidGCObject(exc.value) && gc::isValidGCObject(exc.traceback));
+        assert(this == &exception_ferry);
+    }
+};
+
+thread_local ExcData exception_ferry;
+
+static_assert(offsetof(ExcData, exc) == 0, "wrong offset");
+
+// Scoped timer: builds a Timer from (desc, min_usec) and, when destroyed, ends it and logs the result of
+// Timer::end() to the given StatCounter.
+struct LogTimer {
+    StatCounter& counter;
+    Timer timer;
+
+    LogTimer(const char* desc, StatCounter& ctr, long min_usec = -1)
+        : counter(ctr), timer(desc, min_usec) {
+    }
+
+    ~LogTimer() {
+        auto elapsed = timer.end();
+        counter.log(elapsed);
+    }
+};
+
+static StatCounter us_unwind_loop("us_unwind_loop");
+static StatCounter us_unwind_resume_catch("us_unwind_resume_catch");
+static StatCounter us_unwind_cleanup("us_unwind_cleanup");
+static StatCounter us_unwind_get_proc_info("us_unwind_get_proc_info");
+static StatCounter us_unwind_step("us_unwind_step");
+static StatCounter us_unwind_find_call_site_entry("us_unwind_find_call_site_entry");
+
+// do these need to be separate timers? might as well
+static thread_local Timer per_thread_resume_catch_timer(-1);
+static thread_local Timer per_thread_cleanup_timer(-1);
+#ifndef NDEBUG
+static __thread bool in_cleanup_code = false;
+#endif
+
+extern "C" {
+
+// Unconditionally trips a RELEASE_ASSERT with a fixed message. Declared NORETURN, so RELEASE_ASSERT(0, ...) is
+// presumably expected never to return -- confirm against its definition.
+static NORETURN void panic(void) {
+    RELEASE_ASSERT(0, "pyston::panic() called!");
+}
+
+// Highly useful resource: http://www.airs.com/blog/archives/464
+// talks about DWARF LSDA parsing with respect to C++ exception-handling
+//
+// Decoded form of an LSDA header, as filled in by parse_lsda_header().
+struct lsda_info_t {
+    // base which landing pad offsets are relative to
+    const uint8_t* landing_pad_base;
+    // currently always left as nullptr by parse_lsda_header(); the type table is not used
+    const uint8_t* type_table;
+    // first byte after the LSDA header
+    const uint8_t* call_site_table;
+    // first byte after the call site table
+    const uint8_t* action_table;
+    uint8_t type_table_entry_encoding;      // a DW_EH_PE_xxx value
+    uint8_t call_site_table_entry_encoding; // a DW_EH_PE_xxx value
+};
+
+// Decoded form of one call site table entry, as filled in by parse_call_site_entry().
+struct call_site_entry_t {
+    // start of the covered instruction range (landing pad base + the entry's start offset)
+    const uint8_t* instrs_start;
+    // length of the covered instruction range, in bytes
+    size_t instrs_len_bytes;
+    const uint8_t* landing_pad; // may be NULL if no landing pad
+    // "plus one" so that 0 can mean "no action". offset is in bytes.
+    size_t action_offset_plus_one;
+};
+
+
+// ---------- Parsing stuff ----------
+// Decodes the header of the LSDA that `pip->lsda` points to and stores the result in `*info`.
+//
+// Only an omitted landing pad base is supported (anything else trips a RELEASE_ASSERT); in that case the base is
+// `pip->start_ip`. The type table pointer is always set to nullptr. On return, `info->call_site_table` points just
+// past the header and `info->action_table` points just past the call site table.
+static inline void parse_lsda_header(const unw_proc_info_t* pip, lsda_info_t* info) {
+    const uint8_t* ptr = (const uint8_t*)pip->lsda;
+
+    // 1. Read the landing pad base pointer.
+    uint8_t landing_pad_base_encoding = *ptr++;
+    if (landing_pad_base_encoding == DW_EH_PE_omit) {
+        // The common case is to omit. Then the landing pad base is _Unwind_GetRegion(context), which is the start of
+        // the function.
+        info->landing_pad_base = (const uint8_t*)pip->start_ip;
+    } else {
+        RELEASE_ASSERT(0, "we only support omitting the landing pad base");
+    }
+
+    // 2. Read the type table encoding & base pointer.
+    info->type_table_entry_encoding = *ptr++;
+    if (info->type_table_entry_encoding != DW_EH_PE_omit) {
+        // read ULEB128-formatted byte offset from THIS FIELD to the start of the types table.
+        unsigned uleb_size;
+        uint64_t offset = llvm::decodeULEB128(ptr, &uleb_size);
+        // We don't use the type table, and I'm not sure this calculation is correct - it might be an offset from a
+        // different base, I should use gdb to check it against libgcc. So I've set it to nullptr instead.
+        info->type_table = nullptr;
+        // info->type_table = ptr + offset; // <- The calculation I'm not sure of.
+        // The offset field must still be skipped so that the call-site fields below are read from the right place.
+        ptr += uleb_size;
+    } else { // type table omitted
+        info->type_table = nullptr;
+    }
+
+    // 3. Read the call-site encoding & base pointer.
+    info->call_site_table_entry_encoding = *ptr++;
+    unsigned uleb_size;
+    // Length of the call site table in bytes, stored as a ULEB128.
+    size_t call_site_table_nbytes = llvm::decodeULEB128(ptr, &uleb_size);
+    ptr += uleb_size;
+
+    // The call site table follows immediately after the header.
+    info->call_site_table = ptr;
+    // The action table follows immediately after the call site table.
+    info->action_table = ptr + call_site_table_nbytes;
+
+    assert(info->landing_pad_base);
+    assert(info->call_site_table);
+    assert(info->action_table);
+}
+
+// Decodes the call site table entry starting at `ptr` into `*entry` and returns a pointer to the first byte after
+// it. The start/length/landing-pad fields are read with the encoding recorded in
+// `info->call_site_table_entry_encoding`; only DW_EH_PE_uleb128 and DW_EH_PE_udata4 are supported, and any other
+// encoding trips a RELEASE_ASSERT.
+static inline const uint8_t* parse_call_site_entry(const uint8_t* ptr, const lsda_info_t* info,
+                                                   call_site_entry_t* entry) {
+    size_t instrs_start_offset, instrs_len_bytes, landing_pad_offset, action_offset_plus_one;
+
+    // clang++ recently changed from always doing udata4 here to using uleb128, so we support both
+    unsigned uleb_size;
+    if (DW_EH_PE_uleb128 == info->call_site_table_entry_encoding) {
+        instrs_start_offset = llvm::decodeULEB128(ptr, &uleb_size);
+        ptr += uleb_size;
+        instrs_len_bytes = llvm::decodeULEB128(ptr, &uleb_size);
+        ptr += uleb_size;
+        landing_pad_offset = llvm::decodeULEB128(ptr, &uleb_size);
+        ptr += uleb_size;
+    } else if (DW_EH_PE_udata4 == info->call_site_table_entry_encoding) {
+        // offsets are from landing pad base
+        // Three consecutive 4-byte fields: start offset, length, landing pad offset.
+        instrs_start_offset = (size_t) * (const uint32_t*)ptr;
+        instrs_len_bytes = (size_t) * (const uint32_t*)(ptr + 4);
+        landing_pad_offset = (size_t) * (const uint32_t*)(ptr + 8);
+        ptr += 12;
+    } else {
+        RELEASE_ASSERT(0, "expected call site table entries to use DW_EH_PE_udata4 or DW_EH_PE_uleb128");
+    }
+
+    // action offset (plus one) is always a ULEB128
+    action_offset_plus_one = llvm::decodeULEB128(ptr, &uleb_size);
+    ptr += uleb_size;
+
+    entry->instrs_start = info->landing_pad_base + instrs_start_offset;
+    entry->instrs_len_bytes = instrs_len_bytes;
+    if (0 == landing_pad_offset) {
+        // An offset of 0 is special and indicates "no landing pad", i.e. this call site does not handle exceptions or
+        // perform any cleanup. (The call site entry is still necessary to indicate that it is *expected* that an
+        // exception could be thrown here, and that unwinding should proceed; if the entry were absent, we'd call
+        // std::terminate().)
+        entry->landing_pad = nullptr;
+    } else {
+        entry->landing_pad = info->landing_pad_base + landing_pad_offset;
+    }
+    entry->action_offset_plus_one = action_offset_plus_one;
+
+    return ptr;
+}
+
+// Returns a pointer to the first action record of `entry`'s action chain inside `info->action_table`, or nullptr
+// when the entry has no action (a stored "offset plus one" of 0).
+static inline const uint8_t* first_action(const lsda_info_t* info, const call_site_entry_t* entry) {
+    const size_t biased_offset = entry->action_offset_plus_one;
+    if (biased_offset == 0)
+        return nullptr;
+    // Undo the "plus one" bias to obtain the real byte offset into the action table.
+    return info->action_table + (biased_offset - 1);
+}
+
+// Decodes the action record at `action_ptr`, which consists of two SLEB128 values: a type filter followed by the
+// offset to the next record in the chain.
+//
+// The type filter is written to `*type_filter` (must be non-null). If `num_bytes` is non-null, it receives the
+// total encoded size of the record. Returns the next record in the chain, or nullptr if the stored offset is 0,
+// which terminates the chain. The offset is applied relative to the start of the offset field itself.
+static inline const uint8_t* next_action(const uint8_t* action_ptr, int64_t* type_filter,
+                                         unsigned* num_bytes = nullptr) {
+    assert(type_filter);
+
+    unsigned filter_size;
+    *type_filter = llvm::decodeSLEB128(action_ptr, &filter_size);
+    const uint8_t* offset_field = action_ptr + filter_size;
+
+    unsigned offset_size;
+    intptr_t offset_to_next_entry = llvm::decodeSLEB128(offset_field, &offset_size);
+
+    if (num_bytes)
+        *num_bytes = filter_size + offset_size;
+
+    if (offset_to_next_entry == 0)
+        return nullptr;
+    return offset_field + offset_to_next_entry;
+}
+
+
+// ---------- Printing things for debugging purposes ----------
+// Debugging aid: prints the call site table and the action table of a parsed LSDA to stdout.
+//
+// While walking the call site table, every action chain is followed and the largest action-table offset reached
+// is recorded; the second pass then prints the action table sequentially up to that offset. Structural
+// inconsistencies trip a RELEASE_ASSERT("malformed LSDA").
+static void print_lsda(const lsda_info_t* info) {
+    size_t action_table_min_len_bytes = 0;
+
+    // Print call site table.
+    printf("Call site table:\n");
+    const uint8_t* p = info->call_site_table;
+    assert(p);
+    while (p < info->action_table) { // the call site table ends where the action table begins
+        call_site_entry_t entry;
+        p = parse_call_site_entry(p, info, &entry);
+        printf("  start %p end %p landingpad %p action-plus-one %lx\n", entry.instrs_start,
+               entry.instrs_start + entry.instrs_len_bytes, entry.landing_pad, entry.action_offset_plus_one);
+
+        // Follow the action chain.
+        for (const uint8_t* action_ptr = first_action(info, &entry); action_ptr;) {
+            RELEASE_ASSERT(action_ptr >= info->action_table, "malformed LSDA");
+            ptrdiff_t offset = action_ptr - info->action_table;
+            // add one to indicate that there is an entry here. (consider the case of an empty table, for example.)
+            // would be nicer to set action_table_min_len_bytes to the end of the entry, but that involves uleb-size
+            // arithmetic.
+            // offset is known to be non-negative here (asserted above), so the cast cannot change its value.
+            if ((size_t)offset + 1 > action_table_min_len_bytes)
+                action_table_min_len_bytes = (size_t)offset + 1;
+
+            int64_t type_filter;
+            action_ptr = next_action(action_ptr, &type_filter);
+            if (action_ptr)
+                printf("    %ld: filter %ld  next %ld\n", offset, type_filter, action_ptr - info->action_table);
+            else
+                printf("    %ld: filter %ld  end\n", offset, type_filter);
+        }
+    }
+
+    // Print the action table.
+    printf("Action table:\n");
+    RELEASE_ASSERT(p == info->action_table, "malformed LSDA");
+    while (p < info->action_table + action_table_min_len_bytes) {
+        assert(p);
+        ptrdiff_t offset = p - info->action_table;
+        unsigned num_bytes;
+        int64_t type_filter;
+        const uint8_t* next = next_action(p, &type_filter, &num_bytes);
+        p += num_bytes;
+
+        // Report where this record's chain continues (`next`), not `p`, which is merely the record that happens to
+        // follow it in memory. This matches what the call site table dump above prints for the same record.
+        if (next)
+            printf("  %ld: filter %ld  next %ld\n", offset, type_filter, next - info->action_table);
+        else
+            printf("  %ld: filter %ld  end\n", offset, type_filter);
+    }
+}
+
+// FIXME: duplicated from unwinding.cpp
+// Returns the end address (pip.end_ip) that libunwind reports for the procedure containing `ip`. Trips a
+// RELEASE_ASSERT if the lookup returns nonzero or the reported end address is 0.
+static unw_word_t getFunctionEnd(unw_word_t ip) {
+    unw_proc_info_t pip;
+    // where is the documentation for unw_get_proc_info_by_ip, anyway?
+    int ret = unw_get_proc_info_by_ip(unw_local_addr_space, ip, &pip, NULL);
+    RELEASE_ASSERT(ret == 0 && pip.end_ip, "");
+    return pip.end_ip;
+}
+
+// Debugging aid: prints a description of the frame `cursor' currently points at -- the symbol name found by dladdr,
+// the frame's ip and bp, and a tag saying whether the ip falls in JITted code, the interpreter, or generatorEntry.
+// `pip' is accepted for the caller's convenience but is not read here.
+static void print_frame(unw_cursor_t* cursor, const unw_proc_info_t* pip) {
+    // FIXME: code duplication with PythonFrameIter::incr
+    static unw_word_t interpreter_instr_end = getFunctionEnd((unw_word_t)interpreter_instr_addr);
+    static unw_word_t generator_entry_end = getFunctionEnd((unw_word_t)generatorEntry);
+
+    unw_word_t ip, bp;
+    check(unw_get_reg(cursor, UNW_REG_IP, &ip));
+    check(unw_get_reg(cursor, UNW_TDEP_BP, &bp));
+
+    // NB. unw_get_proc_name is MUCH slower than dl_addr for getting the names of functions, but it gets the names of
+    // more functions. However, it also has a bug that pops up when used on JITted functions, so we use dladdr for now.
+    // (I've put an assert in libunwind that'll catch, but not fix, the bug.) - rntz
+
+    // {
+    //     char name[500];
+    //     unw_word_t off;
+    //     int err = unw_get_proc_name(cursor, name, 500, &off);
+    //     // ENOMEM means name didn't fit in buffer, so it was truncated. We're okay with that.
+    //     RELEASE_ASSERT(!err || err == -UNW_ENOMEM || err == -UNW_ENOINFO, "unw_get_proc_name errored");
+    //     if (err != -UNW_ENOINFO) {
+    //         printf(strnlen(name, 500) < 50 ? "  %-50s" : "  %s\n", name);
+    //     } else {
+    //         printf("  %-50s", "? (no info)");
+    //     }
+    // }
+
+    // Symbol column: padded to 50 characters, except that over-long names get a line to themselves.
+    {
+        Dl_info sym;
+        if (!dladdr((void*)ip, &sym)) { // dladdr returns zero on failure
+            printf("  %-50s", "? (no dl info)");
+        } else if (!sym.dli_sname) {
+            printf("  %-50s", "(unnamed)");
+        } else if (strlen(sym.dli_sname) < 50) {
+            printf("  %-50s", sym.dli_sname);
+        } else {
+            printf("  %s\n", sym.dli_sname);
+        }
+    }
+
+    // Classify the frame first, then print the matching line.
+    enum { COMPILED, INTERPRETED, GENERATOR, OTHER } frame_type;
+    CompiledFunction* cf = getCFForAddress(ip);
+    AST_stmt* cur_stmt = nullptr;
+    if (cf)
+        frame_type = COMPILED;
+    else if ((unw_word_t)interpreter_instr_addr <= ip && ip < interpreter_instr_end)
+        frame_type = INTERPRETED;
+    else if ((unw_word_t)generatorEntry <= ip && ip < generator_entry_end)
+        frame_type = GENERATOR;
+    else
+        frame_type = OTHER;
+
+    switch (frame_type) {
+        case COMPILED:
+            // compiled frame
+            printf("      ip %12lx  bp %lx    JITTED\n", ip, bp);
+            // TODO: get current statement
+            break;
+        case INTERPRETED:
+            // interpreted frame
+            printf("      ip %12lx  bp %lx    interpreted\n", ip, bp);
+            // sometimes this assert()s!
+            // cf = getCFForInterpretedFrame((void*)bp);
+            // cur_stmt = getCurrentStatementForInterpretedFrame((void*) bp);
+            break;
+        case GENERATOR:
+            // generator return frame
+            printf("      ip %12lx  bp %lx    generator\n", ip, bp);
+            break;
+        case OTHER:
+            // generic frame, probably C/C++
+            printf("      ip %12lx  bp %lx\n", ip, bp);
+            break;
+    }
+
+    // Only reachable once the interpreted-frame lookups above are re-enabled: until then cf and cur_stmt stay null
+    // for interpreted frames.
+    if (frame_type == INTERPRETED && cf && cur_stmt) {
+        auto src = cf->clfunc->source.get();
+        // FIXME: dup'ed from lineInfoForFrame
+        LineInfo where(cur_stmt->lineno, cur_stmt->col_offset, src->fn, src->getName());
+        printf("      File \"%s\", line %d, in %s\n", where.file.c_str(), where.line, where.func.c_str());
+    }
+}
+
+
+// ---------- Helpers for unwind_loop ----------
+// Searches the LSDA call-site table for the entry whose instruction range [start, start + len) contains `ip'.
+// Returns true with `*entry' filled in on a hit, false otherwise. `*entry' is overwritten for every entry parsed, so
+// its contents are only meaningful when true is returned.
+static inline bool find_call_site_entry(const lsda_info_t* info, const uint8_t* ip, call_site_entry_t* entry) {
+    const uint8_t* p = info->call_site_table;
+    // The call site table ends where the action table begins.
+    while (p < info->action_table) {
+        p = parse_call_site_entry(p, info, entry);
+        const auto range_end = entry->instrs_start + entry->instrs_len_bytes;
+
+        if (VERBOSITY("cxx_unwind") >= 3) {
+            printf("    start %p end %p landingpad %p action %lx\n", entry->instrs_start, range_end,
+                   entry->landing_pad, entry->action_offset_plus_one);
+        }
+
+        if (ip < range_end) {
+            // Either our IP is inside this entry's range -- found it! ...
+            if (entry->instrs_start <= ip)
+                return true;
+            // ... or this entry lies entirely beyond our IP. The table is sorted by start IP, so every later entry
+            // does too; give up.
+            break;
+        }
+    }
+
+    // If p actually overran *into* info.action_table, we have a malformed LSDA.
+    ASSERT(!(p > info->action_table), "Malformed LSDA; call site entry overlaps action table!");
+    return false;
+}
+
+// Transfers control to `landing_pad' in the frame `cursor' points at. Never returns.
+//
+//   cursor       - libunwind cursor for the frame being resumed; its RAX, RDX and IP are overwritten here
+//   landing_pad  - address to resume execution at; must be non-null
+//   switch_value - placed in RDX; 0 means "run cleanup code", any other value selects an exception handler
+//   exc_data     - the exception being propagated; its address is placed in RAX
+//
+// Also restarts the per-thread timer matching the kind of landing pad, and in debug builds records that cleanup code
+// is running so that _Unwind_Resume can assert on it.
+static inline NORETURN void resume(unw_cursor_t* cursor, const uint8_t* landing_pad, int64_t switch_value,
+                                   const ExcData* exc_data) {
+    exc_data->check();
+    assert(landing_pad);
+    if (VERBOSITY("cxx_unwind") >= 2)
+        printf("  * RESUMED: ip %p  switch_value %ld\n", (const void*)landing_pad, (long)switch_value);
+
+    if (0 != switch_value) {
+        // The exception handler will call __cxa_begin_catch, which stops this timer and logs it.
+        per_thread_resume_catch_timer.restart("resume_catch", 20);
+    } else {
+        // The cleanup code will call _Unwind_Resume, which will stop this timer and log it.
+        // TODO: am I sure cleanup code can't raise exceptions? maybe have an assert!
+        per_thread_cleanup_timer.restart("cleanup", 20);
+#ifndef NDEBUG
+        in_cleanup_code = true;
+#endif
+    }
+
+    // set rax to pointer to exception object
+    // set rdx to the switch_value (0 for cleanup, otherwise an index indicating which exception handler to use)
+    //
+    // NB. assumes x86-64. maybe I should use __builtin_eh_return_data_regno() here?
+    // but then, need to translate into UNW_* values somehow. not clear how.
+    check(unw_set_reg(cursor, UNW_X86_64_RAX, (unw_word_t)exc_data));
+    check(unw_set_reg(cursor, UNW_X86_64_RDX, switch_value));
+
+    // resume!
+    check(unw_set_reg(cursor, UNW_REG_IP, (unw_word_t)landing_pad));
+    unw_resume(cursor);
+    // unw_resume only comes back on failure; treat that as fatal.
+    RELEASE_ASSERT(0, "unw_resume returned!");
+}
+
+// Walks the action chain of a call-site entry and decides what its landing pad should do.
+//
+// Returns CLEANUP_ACTION when only cleanup (destructors etc.) should run, or the positive type filter of the first
+// handler found in the chain. That value is handed to the landing pad as its switch value, which picks among multiple
+// `catch' blocks. The thrown exception is never inspected: in Pyston every handler catches every exception.
+static inline int64_t determine_action(const lsda_info_t* info, const call_site_entry_t* entry) {
+    const uint8_t* p = first_action(info, entry);
+    // An entry with no action chain has destructors/cleanup to run, but no exception handlers.
+    if (!p)
+        return CLEANUP_ACTION;
+
+    if (VERBOSITY("cxx_unwind") >= 3) {
+        printf("      reading action chain\n");
+    }
+
+    // A cleanup action is not taken on sight; it is only remembered, and used if no handler shows up later in the
+    // chain.
+    bool cleanup_requested = false;
+    while (p) {
+        ASSERT(p >= info->action_table, "malformed LSDA");
+        const ptrdiff_t offset = p - info->action_table;
+        int64_t type_filter;
+        p = next_action(p, &type_filter);
+        if (VERBOSITY("cxx_unwind") >= 3) {
+            if (!p)
+                printf("      %ld: filter %ld  end\n", offset, type_filter);
+            else
+                printf("      %ld: filter %ld  next %ld\n", offset, type_filter, p - info->action_table);
+        }
+
+        if (0 != type_filter) {
+            // A nonzero filter is meant to be looked up in the types table and compared against the type of the
+            // thrown exception. Pyston handlers handle everything, so the type information is ignored and the handler
+            // is simply run.
+            //
+            // I don't fully understand negative type filters. For now we don't implement them. See
+            // http://www.airs.com/blog/archives/464 for some information.
+            RELEASE_ASSERT(type_filter > 0, "negative type filters unimplemented");
+            return type_filter;
+        }
+
+        // A type_filter of 0 indicates a cleanup.
+        cleanup_requested = true;
+    }
+
+    if (cleanup_requested)
+        return CLEANUP_ACTION;
+
+    // We ran through the whole action chain and none applied, *and* there was no cleanup indicated. What do we do?
+    // This can't happen currently, but I think the answer is probably panic().
+    RELEASE_ASSERT(0, "action chain exhausted and no cleanup indicated");
+}
+
+// Moves `cp' to the next frame via unw_step, under a LogTimer labelled "unw_step". Returns unw_step's result as-is.
+static inline int step(unw_cursor_t* cp) {
+    LogTimer step_timer("unw_step", us_unwind_step, 5);
+    const int rc = unw_step(cp);
+    return rc;
+}
+
+// The stack-unwinding loop.
+//
+// Steps outward through the stack starting from this function's own context. For each frame that has a personality
+// handler it parses the frame's LSDA, looks up the call-site entry covering (ip - 1), and -- if that entry has a
+// landing pad -- resumes execution there via resume(), which never returns. Frames without a handler, and call sites
+// without a landing pad, are skipped. panic()s if a frame with a handler has no call-site entry covering the IP.
+//
+// Returns only when step() stops reporting further frames without any landing pad having been found; the caller
+// decides what to do then.
+//
+// TODO: integrate incremental traceback generation into this function
+static inline void unwind_loop(const ExcData* exc_data) {
+    Timer t("unwind_loop", 50);
+
+    // NB. https://monoinfinito.wordpress.com/series/exception-handling-in-c/ is a very useful resource
+    // as are http://www.airs.com/blog/archives/460 and http://www.airs.com/blog/archives/464
+    unw_cursor_t cursor;
+    unw_context_t uc; // exists only to initialize cursor
+#ifndef NDEBUG
+    // poison stack memory. have had problems with these structures being insufficiently initialized.
+    memset(&uc, 0xef, sizeof uc);
+    memset(&cursor, 0xef, sizeof cursor);
+#endif
+    unw_getcontext(&uc);
+    unw_init_local(&cursor, &uc);
+
+    while (step(&cursor) > 0) {
+        unw_proc_info_t pip;
+        {
+            // NB. unw_get_proc_info is slow; a significant chunk of all time spent unwinding is spent here.
+            LogTimer t_procinfo("get_proc_info", us_unwind_get_proc_info, 10);
+            check(unw_get_proc_info(&cursor, &pip));
+        }
+        assert((pip.lsda == 0) == (pip.handler == 0));
+        assert(pip.flags == 0);
+
+        if (VERBOSITY("cxx_unwind") >= 2) {
+            print_frame(&cursor, &pip);
+        }
+
+        // Skip frames without handlers
+        if (pip.handler == 0) {
+            continue;
+        }
+
+        RELEASE_ASSERT(pip.handler == (uintptr_t)__gxx_personality_v0,
+                       "personality function other than __gxx_personality_v0; "
+                       "don't know how to unwind through non-C++ functions");
+
+        // Don't call __gxx_personality_v0; we perform dispatch ourselves.
+        // 1. parse LSDA header
+        lsda_info_t info;
+        parse_lsda_header(&pip, &info);
+
+        call_site_entry_t entry;
+        {
+            LogTimer t_call_site("find_call_site_entry", us_unwind_find_call_site_entry, 10);
+
+            // 2. Find our current IP in the call site table.
+            unw_word_t ip;
+            // Checked like every other libunwind call here: on failure `ip' would be left uninitialized and the
+            // lookup below would run on garbage.
+            check(unw_get_reg(&cursor, UNW_REG_IP, &ip));
+            // ip points to the instruction *after* the instruction that caused the error - which is generally (always?)
+            // a call instruction - UNLESS we're in a signal frame, in which case it points at the instruction that
+            // caused the error. For now, we assume we're never in a signal frame. So, we decrement it by one.
+            //
+            // TODO: double-check that we never hit a signal frame.
+            --ip;
+
+            bool found = find_call_site_entry(&info, (const uint8_t*)ip, &entry);
+            // If we didn't find an entry, an exception happened somewhere exceptions should never happen; terminate
+            // immediately.
+            if (!found) {
+                panic();
+            }
+        }
+
+        // 3. Figure out what to do based on the call site entry.
+        if (!entry.landing_pad) {
+            // No landing pad means no exception handling or cleanup; keep unwinding!
+            continue;
+        }
+        // After this point we are guaranteed to resume something rather than unwinding further.
+
+        if (VERBOSITY("cxx_unwind") >= 3) {
+            print_lsda(&info);
+        }
+
+        int64_t switch_value = determine_action(&info, &entry);
+        // Log the loop time now: resume() does not return, so the log call after the loop is never reached on this
+        // path.
+        us_unwind_loop.log(t.end());
+        resume(&cursor, entry.landing_pad, switch_value, exc_data);
+    }
+
+    us_unwind_loop.log(t.end());
+    // Hit end of stack! return & let unwindException determine what to do.
+}
+
+
+// The unwinder entry-point.
+//
+// Calls exc->check(), then runs unwind_loop(). unwind_loop() only returns when it reached the end of the stack
+// without resuming at any landing pad, in which case we panic().
+static void unwind(const ExcData* exc) {
+    exc->check();
+    // The body of this `if' is a deliberate no-op. Presumably it exists as a convenient place to put a debugger
+    // breakpoint that only fires for exceptions whose value has a "magic_break" attribute.
+    if (exc->exc.value->hasattr("magic_break")) {
+        (void)(0 == 0);
+    }
+    unwind_loop(exc);
+    // unwind_loop returned, couldn't find any handler. ruh-roh.
+    panic();
+}
+
+} // extern "C"
+} // namespace pyston
+
+
+// Standard library / runtime functions we override
+#if PYSTON_CUSTOM_UNWINDER
+
+// Replacement for the standard library's std::terminate: fails a RELEASE_ASSERT unconditionally.
+void std::terminate() noexcept {
+    // The default std::terminate assumes things about the C++ exception state which aren't true for our custom
+    // unwinder.
+    RELEASE_ASSERT(0, "std::terminate() called!");
+}
+
+// Stand-in definition of the C++ personality routine. The custom unwinder compares a frame's handler against this
+// symbol's address and dispatches by itself, so actually calling it is an error.
+//
+// wrong type signature, but that's okay, it's extern "C"
+extern "C" void __gxx_personality_v0() {
+    RELEASE_ASSERT(0, "__gxx_personality_v0 should never get called");
+}
+
+// Called when cleanup code has finished and unwinding should continue. Clears the debug-only in_cleanup_code flag,
+// stops and logs the per-thread cleanup timer, and re-enters the unwinder with the same exception data.
+extern "C" void _Unwind_Resume(struct _Unwind_Exception* _exc) {
+    // We must only get here from cleanup code.
+    assert(pyston::in_cleanup_code);
+#ifndef NDEBUG
+    pyston::in_cleanup_code = false;
+#endif
+    pyston::us_unwind_cleanup.log(pyston::per_thread_cleanup_timer.end());
+
+    if (VERBOSITY("cxx_unwind"))
+        printf("***** _Unwind_Resume() *****\n");
+    // we give `_exc' type `struct _Unwind_Exception*' because unwind.h demands it; it's not actually accurate
+    const pyston::ExcData* data = (const pyston::ExcData*)_exc;
+    pyston::unwind(data);
+}
+
+// C++ ABI functionality
+//
+// Definitions of the __cxa_* entry points that compiler-generated throw/catch code calls, implemented on top of the
+// custom unwinder and a single thread-local exception slot.
+namespace __cxxabiv1 {
+
+// Returns the storage for an exception that is about to be thrown. `size' must equal sizeof(pyston::ExcInfo); no
+// memory is actually allocated.
+extern "C" void* __cxa_allocate_exception(size_t size) noexcept {
+    // we should only ever be throwing ExcInfos
+    RELEASE_ASSERT(size == sizeof(pyston::ExcInfo), "allocating exception whose size doesn't match ExcInfo");
+
+    // Instead of allocating memory for this exception, we return a pointer to a pre-allocated thread-local variable.
+    //
+    // This variable, pyston::exception_ferry, is used only while we are unwinding, and should not be used outside of
+    // the unwinder. Since it's a thread-local variable, we *cannot* throw any exceptions while it is live, otherwise we
+    // would clobber it and forget our old exception.
+    //
+    // Q: Why can't we just use cur_thread_state.curexc_{type,value,traceback}?
+    //
+    // A: Because that conflates the space used to store exceptions during C++ unwinding with the space used to store
+    // them during C-API return-code based unwinding! This actually comes up in practice - the original version *did*
+    // use curexc_{type,value,traceback}, and it had a bug.
+    //
+    // In particular, we need to unset the C API exception at an appropriate point so as not to make C-API functions
+    // *think* an exception is being thrown when one isn't. The natural place is __cxa_begin_catch, BUT we need some way
+    // to communicate the exception info to the inside of the catch block - and all we get is the return value of
+    // __cxa_begin_catch, which is a single pointer, when we need three!
+    //
+    // You might think we could get away with only unsetting the C-API information in __cxa_end_catch, but you'd be
+    // wrong! Firstly, this would prohibit calling C-API functions inside a catch-block. Secondly, __cxa_end_catch is
+    // always called when leaving a catch block, even if we're leaving it by re-raising the exception. So if we store
+    // our exception info in curexc_*, and then unset these in __cxa_end_catch, then we'll wipe our exception info
+    // during unwinding!
+
+    return (void*)&pyston::exception_ferry;
+}
+
+// Takes the value that resume() sent us in RAX, and returns a pointer to the exception object actually thrown. In our
+// case, these are the same, and should always be &pyston::exception_ferry.
+//
+// Also stops and logs the per-thread resume_catch timer that resume() restarted.
+extern "C" void* __cxa_begin_catch(void* exc_obj_in) noexcept {
+    assert(exc_obj_in);
+    pyston::us_unwind_resume_catch.log(pyston::per_thread_resume_catch_timer.end());
+
+    if (VERBOSITY("cxx_unwind"))
+        printf("***** __cxa_begin_catch() *****\n");
+
+    pyston::ExcData* e = (pyston::ExcData*)exc_obj_in;
+    e->check();
+    return (void*)&e->exc;
+}
+
+// Called when a catch block is left. Does nothing apart from optional logging.
+extern "C" void __cxa_end_catch() {
+    if (VERBOSITY("cxx_unwind"))
+        printf("***** __cxa_end_catch() *****\n");
+    // See the comment in __cxa_allocate_exception for why we don't clear the exception ferry here.
+}
+
+// This is the mangled symbol for the type info for pyston::ExcInfo.
+#define EXCINFO_TYPE_INFO _ZTIN6pyston7ExcInfoE
+extern "C" std::type_info EXCINFO_TYPE_INFO;
+
+// Entry point for `throw': starts unwinding with `exc_obj' as the exception data. Only pyston::ExcInfo may be
+// thrown (checked through `tinfo'); `dtor' is ignored. Must not be called while cleanup code is running.
+extern "C" void __cxa_throw(void* exc_obj, std::type_info* tinfo, void (*dtor)(void*)) {
+    assert(!pyston::in_cleanup_code);
+    assert(exc_obj);
+    RELEASE_ASSERT(tinfo == &EXCINFO_TYPE_INFO, "can't throw a non-ExcInfo value! type info: %p", tinfo);
+
+    if (VERBOSITY("cxx_unwind"))
+        printf("***** __cxa_throw() *****\n");
+
+    pyston::unwind((const pyston::ExcData*)exc_obj);
+}
+
+// Maps the pointer handed to a landing pad to the address of the thrown object, like __cxa_begin_catch but without
+// touching any timer or logging.
+extern "C" void* __cxa_get_exception_ptr(void* exc_obj_in) noexcept {
+    assert(exc_obj_in);
+    pyston::ExcData* e = (pyston::ExcData*)exc_obj_in;
+    e->check();
+    return (void*)&e->exc;
+}
+
+// We deliberately don't implement rethrowing because we can't implement it correctly with our current strategy for
+// storing the exception info. Don't use bare `throw' from inside an exception handler! Instead, do:
+//
+//     try { ... }
+//     catch(ExcInfo e) {   // copies the exception info received to the stack
+//         ...
+//         throw e;
+//     }
+//
+extern "C" void __cxa_rethrow() {
+    RELEASE_ASSERT(0, "__cxa_rethrow() unimplemented; please don't use bare `throw' in Pyston!");
+}
+} // namespace __cxxabiv1
+
+#endif // PYSTON_CUSTOM_UNWINDER
--- a/src/runtime/descr.cpp
+++ b/src/runtime/descr.cpp
@@ -42,7 +42,7 @@ static void propertyDocCopy(BoxedProperty* prop, Box* fget) {
        get_doc = getattrInternal(fget, "__doc__", NULL);
    } catch (ExcInfo e) {
        if (!e.matches(Exception)) {
-            throw;
+            throw e;
        }
        get_doc = NULL;
    }

--- a/src/runtime/ics.cpp
+++ b/src/runtime/ics.cpp
@@ -20,6 +20,7 @@
 #include "codegen/memmgr.h"
 #include "codegen/patchpoints.h"
 #include "codegen/stackmaps.h"
+#include "codegen/unwinding.h" // registerDynamicEhFrame
 #include "core/common.h"
 #include "core/options.h"
 #include "core/stats.h"
@@ -142,10 +143,12 @@ static const char _eh_frame_template[] =
    "\x00\x00\x00\x00" // terminator
    ;
 #endif
-#define EH_FRAME_SIZE sizeof(_eh_frame_template)
+#define EH_FRAME_SIZE (sizeof(_eh_frame_template) - 1) // omit string-terminating null byte
+
+static_assert(sizeof("") == 1, "strings are null-terminated");

 static void writeTrivialEhFrame(void* eh_frame_addr, void* func_addr, uint64_t func_size) {
-    memcpy(eh_frame_addr, _eh_frame_template, sizeof(_eh_frame_template));
+    memcpy(eh_frame_addr, _eh_frame_template, EH_FRAME_SIZE);

    int32_t* offset_ptr = (int32_t*)((uint8_t*)eh_frame_addr + 0x20);
    int32_t* size_ptr = (int32_t*)((uint8_t*)eh_frame_addr + 0x24);
@@ -162,6 +165,9 @@ void EHFrameManager::writeAndRegister(void* func_addr, uint64_t func_size) {
    assert(eh_frame_addr == NULL);
    eh_frame_addr = malloc(EH_FRAME_SIZE);
    writeTrivialEhFrame(eh_frame_addr, func_addr, func_size);
+    // (EH_FRAME_SIZE - 4) to omit the 4-byte null terminator, otherwise we trip an assert in parseEhFrame.
+    // TODO: can we omit the terminator in general?
+    registerDynamicEhFrame((uint64_t)func_addr, func_size, (uint64_t)eh_frame_addr, EH_FRAME_SIZE - 4);
    registerEHFrames((uint8_t*)eh_frame_addr, (uint64_t)eh_frame_addr, EH_FRAME_SIZE);
 }


--- a/src/runtime/objmodel.cpp
+++ b/src/runtime/objmodel.cpp
@@ -4968,7 +4968,7 @@ extern "C" Box* boxedLocalsGet(Box* boxedLocals, const char* attr, Box* globals)
            // If it throws a KeyError, then the variable doesn't exist so move on
            // and check the globals (below); otherwise, just propogate the exception.
            if (!isSubclass(e.value->cls, KeyError)) {
-                throw;
+                throw e;
            }
        }
    }

--- a/src/runtime/stacktrace.cpp
+++ b/src/runtime/stacktrace.cpp
@@ -45,59 +45,10 @@ void showBacktrace() {
    }
 }

-// Currently-unused libunwind-based unwinding:
-void unwindExc(Box* exc_obj) __attribute__((noreturn));
-void unwindExc(Box* exc_obj) {
-    unw_cursor_t cursor;
-    unw_context_t uc;
-    unw_word_t ip, sp;
-
-    unw_getcontext(&uc);
-    unw_init_local(&cursor, &uc);
-
-    int code;
-    unw_proc_info_t pip;
-
-    while (unw_step(&cursor) > 0) {
-        unw_get_reg(&cursor, UNW_REG_IP, &ip);
-        unw_get_reg(&cursor, UNW_REG_SP, &sp);
-        printf("ip = %lx, sp = %lx\n", (long)ip, (long)sp);
-
-        code = unw_get_proc_info(&cursor, &pip);
-        RELEASE_ASSERT(code == 0, "");
-
-        // printf("%lx %lx %lx %lx %lx %lx %d %d %p\n", pip.start_ip, pip.end_ip, pip.lsda, pip.handler, pip.gp,
-        // pip.flags, pip.format, pip.unwind_info_size, pip.unwind_info);
-
-        assert((pip.lsda == 0) == (pip.handler == 0));
-        assert(pip.flags == 0);
-
-        if (pip.handler == 0) {
-            if (VERBOSITY())
-                printf("Skipping frame without handler\n");
-
-            continue;
-        }
-
-        printf("%lx %lx %lx\n", pip.lsda, pip.handler, pip.flags);
-        // assert(pip.handler == (uintptr_t)__gxx_personality_v0 || pip.handler == (uintptr_t)__py_personality_v0);
-
-        // auto handler_fn = (int (*)(int, int, uint64_t, void*, void*))pip.handler;
-        ////handler_fn(1, 1 /* _UA_SEARCH_PHASE */, 0 /* exc_class */, NULL, NULL);
-        // handler_fn(2, 2 /* _UA_SEARCH_PHASE */, 0 /* exc_class */, NULL, NULL);
-        unw_set_reg(&cursor, UNW_REG_IP, 1);
-
-        // TODO testing:
-        // unw_resume(&cursor);
-    }
-
-    abort();
-}
-
 void raiseRaw(const ExcInfo& e) __attribute__((__noreturn__));
 void raiseRaw(const ExcInfo& e) {
    STAT_TIMER(t0, "us_timer_raiseraw");
-    // Should set these to None before getting here:
+    // Should set these to None rather than null before getting here:
    assert(e.type);
    assert(e.value);
    assert(e.traceback);
@@ -105,11 +56,7 @@ void raiseRaw(const ExcInfo& e) {
    assert(gc::isValidGCObject(e.value));
    assert(gc::isValidGCObject(e.traceback));

-    // Using libgcc:
    throw e;
-
-    // Using libunwind
-    // unwindExc(exc_obj);
 }

 void raiseExc(Box* exc_obj) {

--- a/test/tests/dash_c.py
+++ b/test/tests/dash_c.py
@@ -7,7 +7,10 @@ with open('/dev/null')as ignore:
    # We don't (yet?) require exact stderr or return code compatibility w/
    # python. So we just check that we succeed or fail as appropriate.
    def run(args):
-        print subprocess.call([me] + args, stderr=ignore)
+        code = 0 == subprocess.call([me] + args, stderr=ignore)
+        sys.stdout.flush()
+        print code
+        sys.stdout.flush()

    run(["-c", "print 2 + 2"])
    run(["-c", "import sys; print sys.argv", "hello", "world"])

--- a/test/tests/raise_2arg.py
+++ b/test/tests/raise_2arg.py
+# two-argument `raise' statements where second argument is itself an exception
+#
+# Per the Python 2 language reference, in `raise A, B(2)` the class A becomes the exception type; since B(2) is not
+# an instance of A, it is passed as the single argument to A's constructor. So this is expected to take the
+# `except A' branch, not the `except B' one.
+class A(Exception): pass
+class B(Exception): pass
+
+def f():
+    # Both handlers are listed so that the output shows which class the raised exception actually has.
+    try: raise A, B(2)
+    except A as e:
+        print 'A', e
+    except B as e:
+        print 'B', e
+f()
--- a/tools/git_am_automated.py
+++ b/tools/git_am_automated.py
+import os, os.path
+import subprocess
+import sys
+
+def main():
+    """Apply patch files to a git checkout with `git am', cleaning up on failure.
+
+    Usage: git_am_automated.py REPO [PATCH ...]
+
+    REPO must be a directory containing a `.git' entry. Each PATCH must exist; relative PATCH paths are interpreted
+    relative to the directory the script was started from. Exits 0 if `git am' succeeds. If `git am' fails, runs
+    `git am --abort' and exits 1. Exits 2 if REPO is missing from the command line.
+    """
+    if len(sys.argv) < 2:
+        print >>sys.stderr, "usage: %s REPO [PATCH ...]" % (sys.argv[0],)
+        sys.exit(2)
+
+    repo = sys.argv[1]
+    # Make the patch paths absolute *before* we chdir into the repo; otherwise relative paths pass the existence
+    # check below but no longer point at the patches by the time `git am' runs.
+    patches = [os.path.abspath(fn) for fn in sys.argv[2:]]
+    gitfile = os.path.join(repo, '.git')
+
+    assert os.path.isdir(repo), "Expected to find repo at %s" % (repo,)
+    assert os.path.exists(gitfile), "Expected %s to exist" % (gitfile,)
+    for fn in patches:
+        assert os.path.exists(fn), "Expected a patch file/dir at %s" % (fn,)
+
+    os.chdir(repo)
+    # `--' keeps patch paths from being parsed as options.
+    code = subprocess.call(["git", "am", "--"] + patches)
+    if not code:
+        sys.exit(0)
+
+    # git am errored. recover by unconditionally aborting.
+    print >>sys.stderr, "----- Running `git am --abort' -----"
+    subprocess.check_call(["git", "am", "--abort"])
+    sys.exit(1)
+
+# Run main() only when executed as a script, not when imported as a module.
+if __name__ == '__main__':
+    main()