author     bors <bors@rust-lang.org>  2013-10-19 09:46:18 -0700
committer  bors <bors@rust-lang.org>  2013-10-19 09:46:18 -0700
commit     31a209ca4251126dc03cfbb9f4bbb54f9d296d5d (patch)
tree       b31873267870fb2aaf597d1e9bfdd88448e3890a /src/libstd
parent     5751794d97941176571bdf7c16f909ac9845520c (diff)
parent     6d8330afb6c925d1092f27919f61d4ce6a3fb1d4 (diff)
auto merge of #9834 : alexcrichton/rust/morestack, r=brson
This commit re-introduces the functionality of __morestack in a way that was
not originally anticipated. Rust does not currently have segmented stacks;
rather, each task gets one large stack segment. We previously did not detect
when these stack segments were overrun, but this commit leverages __morestack
to check for exactly that.

This commit purges a lot of the old __morestack and stack limit C++
functionality, migrating the necessary chunks to rust. The stack limit is now
entirely maintained in rust, and the "main logic bits" of __morestack are now
implemented in rust as well.

I put my best effort into validating that this currently builds and runs successfully on osx and linux 32/64 bit, but I was unable to get it working on windows. We never did have unwinding through __morestack frames, and although I poked at it for a bit, I was unable to work out why unwinding doesn't work right now.

A focus of this commit is to implement as much of the logic in rust as possible. This involved some liberal usage of `no_split_stack` in various locations, along with some use of the `asm!` macro (scary). I modified a bit of C++ to stop calling `record_sp_limit`, because it is no longer defined in C++ but rather in rust.

Another consequence of this commit is that `thread_local_storage::{get, set}` must both be flagged with `#[rust_stack]`. I've briefly looked at the implementations on osx/linux/windows to verify that they use pretty small stacks, and I'm fairly sure they're all well under the 20K red zone, so we probably don't have a lot to worry about.

Other things worthy of note:
* The default stack size is now 4MB instead of 2MB. This is so that when we request 2MB to call a C function, you don't immediately overflow just because you have already consumed some stack.
* `asm!` is actually pretty cool; maybe we could define context switching with it?
* I wanted to add links about all this jazz of storing information in TLS, but I was only able to find one for the windows implementation. Otherwise my suggestion is just "disassemble on that arch and see what happens".
* I put my best effort forward on arm/mips to tweak __morestack correctly; we have no ability to test this, so an extra set of eyes would be useful on these spots.
* This is all really tricky stuff, so I tried to put as many comments as I thought were necessary, but if anything is still unclear (or I completely forgot to take something into account), I'm willing to write more!
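
For context on the mechanism itself: LLVM's segmented-stack prologue compares the stack pointer against a limit stored at a fixed TLS offset and calls __morestack when the check fails. Below is a rough sketch of the equivalent logic on x86_64 linux, written as a hypothetical helper in the old `asm!` style this patch uses; the %fs:112 slot matches `record_sp_limit` in the diff.

    // Sketch only: what the compiler-emitted prologue effectively does.
    #[inline(always)]
    unsafe fn prologue_check(frame_size: uint) {
        let mut sp: uint = 0;
        let mut limit: uint = 0;
        asm!("movq %rsp, $0" : "=r"(sp) ::: "volatile");
        asm!("movq %fs:112, $0" : "=r"(limit) ::: "volatile");
        if sp - frame_size < limit {
            // __morestack fires here; with this patch it funnels into
            // rust_stack_exhausted() and aborts rather than growing the stack.
        }
    }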
Diffstat (limited to 'src/libstd')
-rw-r--r--  src/libstd/rt/context.rs               283
-rw-r--r--  src/libstd/rt/crate_map.rs               2
-rw-r--r--  src/libstd/rt/env.rs                     2
-rw-r--r--  src/libstd/rt/sched.rs                   2
-rw-r--r--  src/libstd/rt/task.rs                  102
-rw-r--r--  src/libstd/rt/thread.rs                 43
-rw-r--r--  src/libstd/rt/thread_local_storage.rs   45
7 files changed, 412 insertions, 67 deletions
diff --git a/src/libstd/rt/context.rs b/src/libstd/rt/context.rs
index 222f9a44b17..7f7545ca230 100644
--- a/src/libstd/rt/context.rs
+++ b/src/libstd/rt/context.rs
@@ -11,9 +11,12 @@
 use option::*;
 use super::stack::StackSegment;
 use libc::c_void;
+use uint;
 use cast::{transmute, transmute_mut_unsafe,
            transmute_region, transmute_mut_region};
 
+pub static RED_ZONE: uint = 20 * 1024;
+
 // FIXME #7761: Registers is boxed so that it is 16-byte aligned, for storing
 // SSE regs.  It would be marginally better not to do this. In C++ we
 // use an attribute on a struct.
@@ -24,14 +27,17 @@ pub struct Context {
     /// The context entry point, saved here for later destruction
     start: Option<~~fn()>,
     /// Hold the registers while the task or scheduler is suspended
-    regs: ~Registers
+    regs: ~Registers,
+    /// Lower bound and upper bound for the stack
+    stack_bounds: Option<(uint, uint)>,
 }
 
 impl Context {
     pub fn empty() -> Context {
         Context {
             start: None,
-            regs: new_regs()
+            regs: new_regs(),
+            stack_bounds: None,
         }
     }
 
@@ -47,7 +53,6 @@ impl Context {
 
         let fp: *c_void = task_start_wrapper as *c_void;
         let argp: *c_void = unsafe { transmute::<&~fn(), *c_void>(&*start) };
-        let stack_base: *uint = stack.start();
         let sp: *uint = stack.end();
         let sp: *mut uint = unsafe { transmute_mut_unsafe(sp) };
         // Save and then immediately load the current context,
@@ -57,11 +62,23 @@ impl Context {
             swap_registers(transmute_mut_region(&mut *regs), transmute_region(&*regs));
         };
 
-        initialize_call_frame(&mut *regs, fp, argp, sp, stack_base);
+        initialize_call_frame(&mut *regs, fp, argp, sp);
 
+        // Scheduler tasks don't have a stack in the "we allocated it" sense,
+        // but rather they run on pthread stacks. We have complete control over
+        // the code running on them (and hopefully they don't overflow).
+        // Additionally, their coroutine stacks are listed as being
+        // zero-length, so that's how we tell the two apart here.
+        let stack_base: *uint = stack.start();
+        let bounds = if sp as uint == stack_base as uint {
+            None
+        } else {
+            Some((stack_base as uint, sp as uint))
+        };
         return Context {
             start: Some(start),
-            regs: regs
+            regs: regs,
+            stack_bounds: bounds,
         }
     }
 
@@ -79,8 +96,25 @@ impl Context {
         let in_regs: &Registers = match in_context {
             &Context { regs: ~ref r, _ } => r
         };
-        rtdebug!("doing raw swap");
-        unsafe { swap_registers(out_regs, in_regs) };
+
+        rtdebug!("noting the stack limit and doing raw swap");
+
+        unsafe {
+            // Right before we switch to the new context, set the new context's
+            // stack limit in the OS-specified TLS slot. This also means that
+            // we cannot call any more rust functions after record_stack_bounds
+            // returns because they would all likely fail due to the limit being
+            // invalid for the current task. Lucky for us `swap_registers` is a
+            // C function so we don't have to worry about that!
+            match in_context.stack_bounds {
+                Some((lo, hi)) => record_stack_bounds(lo, hi),
+                // If we're going back to one of the original contexts or
+                // something that's possibly not a "normal task", then reset
+                // the stack limit to 0 to make morestack never fail
+                None => record_stack_bounds(0, uint::max_value),
+            }
+            swap_registers(out_regs, in_regs)
+        }
     }
 }
 
@@ -89,6 +123,29 @@ extern {
     fn swap_registers(out_regs: *mut Registers, in_regs: *Registers);
 }
 
+// Register contexts used in various architectures
+//
+// These structures all represent a context of one task throughout its
+// execution. Each struct is a representation of the architecture's register
+// set. When swapping between tasks, these register sets are used to save off
+// the current registers into one struct, and load them all from another.
+//
+// Note that this is only used for context switching, which means that some of
+// the registers may go unused. For example, for architectures with
+// callee/caller saved registers, the context will only reflect the callee-saved
+// registers. This is because the caller saved registers are already stored
+// elsewhere on the stack (if it was necessary anyway).
+//
+// Additionally, there may be fields on various architectures which are unused
+// entirely because they only reflect what is theoretically possible for a
+// "complete register set" to show, but user-space cannot alter these registers.
+// An example of this would be the segment selectors for x86.
+//
+// These structures/functions are roughly in-sync with the source files inside
+// of src/rt/arch/$arch. The only currently used function from those folders is
+// the `swap_registers` function, but that's only because for now segmented
+// stacks are disabled.
+
 #[cfg(target_arch = "x86")]
 struct Registers {
     eax: u32, ebx: u32, ecx: u32, edx: u32,
@@ -109,7 +166,7 @@ fn new_regs() -> ~Registers {
 
 #[cfg(target_arch = "x86")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
 
     let sp = align_down(sp);
     let sp = mut_offset(sp, -4);
@@ -125,6 +182,8 @@ fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
     regs.ebp = 0;
 }
 
+// windows requires saving more registers (both general and XMM), so the windows
+// register context must be larger.
 #[cfg(windows, target_arch = "x86_64")]
 type Registers = [uint, ..34];
 #[cfg(not(windows), target_arch = "x86_64")]
@@ -137,29 +196,14 @@ fn new_regs() -> ~Registers { ~([0, .. 22]) }
 
 #[cfg(target_arch = "x86_64")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, stack_base: *uint) {
+                         sp: *mut uint) {
 
-    // Redefinitions from regs.h
+    // Redefinitions from rt/arch/x86_64/regs.h
     static RUSTRT_ARG0: uint = 3;
     static RUSTRT_RSP: uint = 1;
     static RUSTRT_IP: uint = 8;
     static RUSTRT_RBP: uint = 2;
 
-    #[cfg(windows)]
-    fn initialize_tib(regs: &mut Registers, sp: *mut uint, stack_base: *uint) {
-        // Redefinitions from regs.h
-        static RUSTRT_ST1: uint = 11; // stack bottom
-        static RUSTRT_ST2: uint = 12; // stack top
-        regs[RUSTRT_ST1] = sp as uint;
-        regs[RUSTRT_ST2] = stack_base as uint;
-    }
-    #[cfg(not(windows))]
-    fn initialize_tib(_: &mut Registers, _: *mut uint, _: *uint) {
-    }
-
-    // Win64 manages stack range at TIB: %gs:0x08 (top) and %gs:0x10 (bottom)
-    initialize_tib(regs, sp, stack_base);
-
     let sp = align_down(sp);
     let sp = mut_offset(sp, -1);
 
@@ -167,9 +211,9 @@ fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
     unsafe { *sp = 0; }
 
     rtdebug!("creating call frame");
-    rtdebug!("fptr {}", fptr as uint);
-    rtdebug!("arg {}", arg as uint);
-    rtdebug!("sp {}", sp as uint);
+    rtdebug!("fptr {}", fptr);
+    rtdebug!("arg {}", arg);
+    rtdebug!("sp {}", sp);
 
     regs[RUSTRT_ARG0] = arg as uint;
     regs[RUSTRT_RSP] = sp as uint;
@@ -187,7 +231,7 @@ fn new_regs() -> ~Registers { ~([0, .. 32]) }
 
 #[cfg(target_arch = "arm")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
     let sp = align_down(sp);
     // sp of arm eabi is 8-byte aligned
     let sp = mut_offset(sp, -2);
@@ -208,7 +252,7 @@ fn new_regs() -> ~Registers { ~([0, .. 32]) }
 
 #[cfg(target_arch = "mips")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
     let sp = align_down(sp);
     // sp of mips o32 is 8-byte aligned
     let sp = mut_offset(sp, -2);
@@ -236,3 +280,182 @@ pub fn mut_offset<T>(ptr: *mut T, count: int) -> *mut T {
     use mem::size_of;
     (ptr as int + count * (size_of::<T>() as int)) as *mut T
 }
+
+#[inline(always)]
+pub unsafe fn record_stack_bounds(stack_lo: uint, stack_hi: uint) {
+    // When the old runtime had segmented stacks, it used a calculation that was
+    // "limit + RED_ZONE + FUDGE". The red zone was for things like dynamic
+    // symbol resolution, llvm function calls, etc. In theory this red zone
+    // value is 0, but it matters far less when we have gigantic stacks because
+    // we don't need to be so exact about our stack budget. The "fudge factor"
+    // was because LLVM doesn't emit a stack check for functions < 256 bytes in
+    // size. Again though, we have giant stacks, so we round all these
+    // calculations up to the nice round number of 20k.
+    record_sp_limit(stack_lo + RED_ZONE);
+
+    return target_record_stack_bounds(stack_lo, stack_hi);
+
+    #[cfg(not(windows))] #[cfg(not(target_arch = "x86_64"))] #[inline(always)]
+    unsafe fn target_record_stack_bounds(_stack_lo: uint, _stack_hi: uint) {}
+    #[cfg(windows, target_arch = "x86_64")] #[inline(always)]
+    unsafe fn target_record_stack_bounds(stack_lo: uint, stack_hi: uint) {
+        // Windows compiles C functions which may check the stack bounds. This
+        // means that if we want to perform valid FFI on windows, then we need
+        // to ensure that the stack bounds are what they truly are for this
+        // task. More info can be found at:
+        //   https://github.com/mozilla/rust/issues/3445#issuecomment-26114839
+        //
+        // stack range is at TIB: %gs:0x08 (top) and %gs:0x10 (bottom)
+        asm!("mov $0, %gs:0x08" :: "r"(stack_lo) :: "volatile");
+        asm!("mov $0, %gs:0x10" :: "r"(stack_hi) :: "volatile");
+    }
+}
+
+/// Records the current limit of the stack as specified by `end`.
+///
+/// This is stored in an OS-dependent location, likely inside of the thread
+/// local storage. The limit lives at a pre-ordained location because that is
+/// where LLVM has emitted code to check it.
+///
+/// Note that this cannot be called like a normal function. It changes the
+/// stack limit, so any function call made after it returns may trigger the
+/// morestack logic if you're not careful.
+///
+/// Also note that this and all of the inner functions are flagged as
+/// "inline(always)" because they're messing around with the stack limits. It
+/// would be unfortunate for the functions themselves to trigger a morestack
+/// invocation (as they would if they were actual function calls).
+#[inline(always)]
+pub unsafe fn record_sp_limit(limit: uint) {
+    return target_record_sp_limit(limit);
+
+    // x86-64
+    #[cfg(target_arch = "x86_64", target_os = "macos")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $$0x60+90*8, %rsi
+              movq $0, %gs:(%rsi)" :: "r"(limit) : "rsi" : "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "linux")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $0, %fs:112" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "win32")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        // see: http://en.wikipedia.org/wiki/Win32_Thread_Information_Block
+        // store this inside of the "arbitrary data slot", but double the size
+        // because this is 64 bit instead of 32 bit
+        asm!("movq $0, %gs:0x28" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $0, %fs:24" :: "r"(limit) :: "volatile")
+    }
+
+    // x86
+    #[cfg(target_arch = "x86", target_os = "macos")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movl $$0x48+90*4, %eax
+              movl $0, %gs:(%eax)" :: "r"(limit) : "eax" : "volatile")
+    }
+    #[cfg(target_arch = "x86", target_os = "linux")]
+    #[cfg(target_arch = "x86", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movl $0, %gs:48" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86", target_os = "win32")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        // see: http://en.wikipedia.org/wiki/Win32_Thread_Information_Block
+        // store this inside of the "arbitrary data slot"
+        asm!("movl $0, %fs:0x14" :: "r"(limit) :: "volatile")
+    }
+
+    // mips, arm - Some brave soul can port these to inline asm, but it's over
+    //             my head personally
+    #[cfg(target_arch = "mips")]
+    #[cfg(target_arch = "arm")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        return record_sp_limit(limit as *c_void);
+        extern {
+            #[rust_stack]
+            fn record_sp_limit(limit: *c_void);
+        }
+    }
+}
+
+/// The counterpart of the function above, this function will fetch the current
+/// stack limit stored in TLS.
+///
+/// Note that all of these functions are meant to be exact counterparts of their
+/// brethren above, except that the operands are reversed.
+///
+/// As with the setter, this function does not have a __morestack header and can
+/// therefore be called in a "we're out of stack" situation.
+#[inline(always)]
+// NOTE: after the next snapshot, can remove the initialization before inline
+//       assembly due to an improvement in how it's handled, then this specific
+//       allow directive should get removed.
+#[allow(dead_assignment)]
+pub unsafe fn get_sp_limit() -> uint {
+    return target_get_sp_limit();
+
+    // x86-64
+    #[cfg(target_arch = "x86_64", target_os = "macos")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq $$0x60+90*8, %rsi
+              movq %gs:(%rsi), $0" : "=r"(limit) :: "rsi" : "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "linux")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %fs:112, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "win32")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %gs:0x28, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %fs:24, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+
+    // x86
+    #[cfg(target_arch = "x86", target_os = "macos")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl $$0x48+90*4, %eax
+              movl %gs:(%eax), $0" : "=r"(limit) :: "eax" : "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86", target_os = "linux")]
+    #[cfg(target_arch = "x86", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl %gs:48, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86", target_os = "win32")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl %fs:0x14, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+
+    // mips, arm - Some brave soul can port these to inline asm, but it's over
+    //             my head personally
+    #[cfg(target_arch = "mips")]
+    #[cfg(target_arch = "arm")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        return get_sp_limit() as uint;
+        extern {
+            #[rust_stack]
+            fn get_sp_limit() -> *c_void;
+        }
+    }
+}
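
A quick sanity check of the setter/getter pair added above, as a hypothetical sketch. Lowering the limit is safe here because the running stack pointer sits far above it, and the old value is restored afterwards:

    unsafe {
        let old = get_sp_limit();
        record_sp_limit(0x1000);
        assert_eq!(get_sp_limit(), 0x1000);
        record_sp_limit(old); // restore so later checks use the real limit
    }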
diff --git a/src/libstd/rt/crate_map.rs b/src/libstd/rt/crate_map.rs
index 8785dcca7bd..96a0069e851 100644
--- a/src/libstd/rt/crate_map.rs
+++ b/src/libstd/rt/crate_map.rs
@@ -17,7 +17,7 @@ use vec::ImmutableVector;
 // and instead look them up at runtime, which we need to resolve
 // the crate_map properly.
 #[cfg(target_os = "macos")]
-#[link_args = "-undefined dynamic_lookup"]
+#[link_args = "-Wl,-U,__rust_crate_map_toplevel"]
 extern {}
 
 pub struct ModEntry<'self> {
diff --git a/src/libstd/rt/env.rs b/src/libstd/rt/env.rs
index 5b840655120..c02e7fe9013 100644
--- a/src/libstd/rt/env.rs
+++ b/src/libstd/rt/env.rs
@@ -17,7 +17,7 @@ use os;
 // Note that these are all accessed without any synchronization.
 // They are expected to be initialized once then left alone.
 
-static mut MIN_STACK: uint = 2000000;
+static mut MIN_STACK: uint = 4000000;
 static mut DEBUG_BORROW: bool = false;
 
 pub fn init() {
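
The arithmetic behind this bump, using the numbers from the commit message (illustrative only; FFI_RESERVATION is a made-up name for the 2MB a C call requests):

    static MIN_STACK: uint = 4000000;               // new default task stack
    static FFI_RESERVATION: uint = 2 * 1024 * 1024; // requested to call a C function
    // Headroom left for rust code before an FFI call trips the limit check:
    // 4000000 - 2097152 ~= 1.8MB, instead of essentially zero under the old
    // 2MB default.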
diff --git a/src/libstd/rt/sched.rs b/src/libstd/rt/sched.rs
index 7724f58153e..93ac308df3a 100644
--- a/src/libstd/rt/sched.rs
+++ b/src/libstd/rt/sched.rs
@@ -173,7 +173,7 @@ impl Scheduler {
 
         // Now that we have an empty task struct for the scheduler
         // task, put it in TLS.
-        Local::put::(sched_task);
+        Local::put(sched_task);
 
         // Before starting our first task, make sure the idle callback
         // is active. As we do not start in the sleep state this is
diff --git a/src/libstd/rt/task.rs b/src/libstd/rt/task.rs
index 889d9bb3156..28c38ac9b53 100644
--- a/src/libstd/rt/task.rs
+++ b/src/libstd/rt/task.rs
@@ -29,6 +29,7 @@ use rt::logging::StdErrLogger;
 use super::local_heap::LocalHeap;
 use rt::sched::{Scheduler, SchedHandle};
 use rt::stack::{StackSegment, StackPool};
+use rt::context;
 use rt::context::Context;
 use unstable::finally::Finally;
 use task::spawn::Taskgroup;
@@ -465,6 +466,80 @@ impl Unwinder {
     }
 }
 
+/// This function is invoked from rust's current __morestack function. Segmented
+/// stacks are not actually enabled as segmented stacks; each task instead gets
+/// one giant stack segment. This means that whenever we run out of stack, we
+/// want to truly consider it a stack overflow rather than allocating a new
+/// stack segment.
+#[no_mangle]      // - this is called from C code
+#[no_split_stack] // - it would be sad for this function to trigger __morestack
+#[doc(hidden)] // XXX: this function shouldn't have to be `pub` to get exported
+               //      so it can be linked against, we should have a better way
+               //      of specifying that.
+pub extern "C" fn rust_stack_exhausted() {
+    use rt::in_green_task_context;
+    use rt::task::Task;
+    use rt::local::Local;
+    use rt::logging::Logger;
+    use unstable::intrinsics;
+
+    unsafe {
+        // We're calling this function because the stack just ran out. We need
+        // to call some other rust functions, but if we invoke the functions
+        // right now it'll just trigger this handler being called again. In
+        // order to alleviate this, we move the stack limit to be inside of the
+        // red zone that was allocated for exactly this reason.
+        let limit = context::get_sp_limit();
+        context::record_sp_limit(limit - context::RED_ZONE / 2);
+
+        // This probably isn't the best course of action. Ideally one would want
+        // to unwind the stack here instead of just aborting the entire process.
+        // This is a tricky problem, however. There's a few things which need to
+        // be considered:
+        //
+        //  1. We're here because of a stack overflow, yet unwinding will run
+        //     destructors and hence arbitrary code. What if that code overflows
+        //     the stack? One possibility is to use the above allocation of an
+        //     extra 10k to hope that we don't hit the limit, and if we do then
+        //     abort the whole program. Not the best, but kind of hard to deal
+        //     with unless we want to switch stacks.
+        //
+        //  2. LLVM will optimize functions based on whether they can unwind or
+        //     not. It will flag functions with 'nounwind' if it believes that
+        //     the function cannot trigger unwinding, but if we do unwind on
+        //     stack overflow then it means that we could unwind in any function
+        //     anywhere. We would have to make sure that LLVM only places the
+        //     nounwind flag on functions which don't call any other functions.
+        //
+        //  3. The function that overflowed may have owned arguments. These
+        //     arguments need to have their destructors run, but we haven't even
+        //     begun executing the function yet, so unwinding will not run
+        //     any landing pads for these functions. If this is ignored, then
+        //     the arguments will just be leaked.
+        //
+        // Exactly what to do here is a very delicate topic and is possibly
+        // still up in the air. Some relevant issues:
+        //
+        //  #3555 - out-of-stack failure leaks arguments
+        //  #3695 - should there be a stack limit?
+        //  #9855 - possible strategies which could be taken
+        //  #9854 - unwinding on windows through __morestack has never worked
+        //  #2361 - possible implementation of not using landing pads
+
+        if in_green_task_context() {
+            do Local::borrow |task: &mut Task| {
+                let n = task.name.as_ref().map(|n| n.as_slice()).unwrap_or("<unnamed>");
+
+                format_args!(|args| { task.logger.log(args) },
+                             "task '{}' has overflowed its stack", n);
+            }
+        } else {
+            rterrln!("stack overflow in non-task context");
+        }
+
+        intrinsics::abort();
+    }
+}
+
 /// This is the entry point of unwinding for things like lang items and such.
 /// The arguments are normally generated by the compiler.
 pub fn begin_unwind(msg: *c_char, file: *c_char, line: size_t) -> ! {
@@ -481,22 +556,33 @@ pub fn begin_unwind(msg: *c_char, file: *c_char, line: size_t) -> ! {
         let msg = match msg.as_str() {
             Some(s) => s, None => rtabort!("message wasn't utf8?")
         };
-        let file = match file.as_str() {
-            Some(s) => s, None => rtabort!("message wasn't utf8?")
-        };
 
         if in_green_task_context() {
             // Be careful not to allocate in this block, if we're failing we may
             // have been failing due to a lack of memory in the first place...
             do Local::borrow |task: &mut Task| {
                 let n = task.name.as_ref().map(|n| n.as_slice()).unwrap_or("<unnamed>");
-                format_args!(|args| { task.logger.log(args) },
-                             "task '{}' failed at '{}', {}:{}",
-                             n, msg, file, line);
+
+                match file.as_str() {
+                    Some(file) => {
+                        format_args!(|args| { task.logger.log(args) },
+                                     "task '{}' failed at '{}', {}:{}",
+                                     n, msg, file, line);
+                    }
+                    None => {
+                        format_args!(|args| { task.logger.log(args) },
+                                     "task '{}' failed at '{}'", n, msg);
+                    }
+                }
             }
         } else {
-            rterrln!("failed in non-task context at '{}', {}:{}",
-                     msg, file, line as int);
+            match file.as_str() {
+                Some(file) => {
+                    rterrln!("failed in non-task context at '{}', {}:{}",
+                             msg, file, line as int);
+                }
+                None => rterrln!("failed in non-task context at '{}'", msg),
+            }
         }
 
         let task: *mut Task = Local::unsafe_borrow();
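
A minimal way to exercise the new overflow path is unbounded recursion; a hypothetical test like the following should now log "task '<unnamed>' has overflowed its stack" and abort, instead of running off the end of the stack silently:

    fn recurse(depth: uint) -> uint {
        let buf = [depth as u8, ..1024];      // burn ~1k of stack per frame
        buf[0] as uint + recurse(depth + 1)   // use buf so the frame stays live
    }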
diff --git a/src/libstd/rt/thread.rs b/src/libstd/rt/thread.rs
index 8b64fda2136..e774b81da35 100644
--- a/src/libstd/rt/thread.rs
+++ b/src/libstd/rt/thread.rs
@@ -8,8 +8,11 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use cast;
 use libc;
 use ops::Drop;
+use unstable::raw;
+use uint;
 
 #[allow(non_camel_case_types)] // runtime type
 type raw_thread = libc::c_void;
@@ -17,21 +20,38 @@ type raw_thread = libc::c_void;
 pub struct Thread {
     main: ~fn(),
     raw_thread: *raw_thread,
-    joined: bool
+    joined: bool,
 }
 
 impl Thread {
+    #[fixed_stack_segment] #[inline(never)]
     pub fn start(main: ~fn()) -> Thread {
-        fn substart(main: &~fn()) -> *raw_thread {
-            #[fixed_stack_segment]; #[inline(never)];
-
-            unsafe { rust_raw_thread_start(main) }
+        // This is the starting point of rust os threads. The first thing we do
+        // is make sure that we don't trigger __morestack (which is also why
+        // this has a no_split_stack annotation); we then rebuild the main
+        // closure and invoke it from there.
+        #[no_split_stack]
+        extern "C" fn thread_start(code: *(), env: *()) {
+            use rt::context;
+            unsafe {
+                context::record_stack_bounds(0, uint::max_value);
+                let f: &fn() = cast::transmute(raw::Closure {
+                    code: code,
+                    env: env,
+                });
+                f();
+            }
         }
-        let raw = substart(&main);
+
+        let raw_thread = unsafe {
+            let c: raw::Closure = cast::transmute_copy(&main);
+            let raw::Closure { code, env } = c;
+            rust_raw_thread_start(thread_start, code, env)
+        };
         Thread {
             main: main,
-            raw_thread: raw,
-            joined: false
+            raw_thread: raw_thread,
+            joined: false,
         }
     }
 
@@ -55,7 +75,8 @@ impl Drop for Thread {
 }
 
 extern {
-    pub fn rust_raw_thread_start(f: &(~fn())) -> *raw_thread;
-    pub fn rust_raw_thread_join(thread: *raw_thread);
-    pub fn rust_raw_thread_delete(thread: *raw_thread);
+    fn rust_raw_thread_start(f: extern "C" fn(*(), *()),
+                             code: *(), env: *()) -> *raw_thread;
+    fn rust_raw_thread_join(thread: *raw_thread);
+    fn rust_raw_thread_delete(thread: *raw_thread);
 }
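
From the caller's perspective nothing changes; the closure is split into its (code, env) pair only at the FFI boundary. A usage sketch (the join call is hypothetical here, assumed to be backed by rust_raw_thread_join as in the Drop impl):

    let main: ~fn() = || println("hello from a raw OS thread");
    let t = Thread::start(main);
    t.join(); // hypothetical; not shown in this hunk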
diff --git a/src/libstd/rt/thread_local_storage.rs b/src/libstd/rt/thread_local_storage.rs
index cd89d09ffc0..ddb104240f2 100644
--- a/src/libstd/rt/thread_local_storage.rs
+++ b/src/libstd/rt/thread_local_storage.rs
@@ -27,15 +27,11 @@ pub unsafe fn create(key: &mut Key) {
 }
 
 #[cfg(unix)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn set(key: Key, value: *mut c_void) {
     assert_eq!(0, pthread_setspecific(key, value));
 }
 
 #[cfg(unix)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn get(key: Key) -> *mut c_void {
     pthread_getspecific(key)
 }
@@ -53,8 +49,21 @@ type pthread_key_t = ::libc::c_uint;
 #[cfg(unix)]
 extern {
     fn pthread_key_create(key: *mut pthread_key_t, dtor: *u8) -> c_int;
-    fn pthread_setspecific(key: pthread_key_t, value: *mut c_void) -> c_int;
+
+    // This function is a very cheap operation on both osx and unix: on osx it
+    // turns out to be just three instructions, and on unix it's a cheap
+    // function which only uses a very small amount of stack.
+    //
+    // It is marked #[rust_stack] not merely because we think it has a small
+    // stack, but because we would like to be able to fetch information from
+    // thread-local storage when a task is running very low on its stack budget.
+    // For example, this is invoked whenever stack overflow is detected, and we
+    // obviously have very little budget to deal with it (certainly not anything
+    // close to a fixed_stack_segment).
+    #[rust_stack]
     fn pthread_getspecific(key: pthread_key_t) -> *mut c_void;
+    #[rust_stack]
+    fn pthread_setspecific(key: pthread_key_t, value: *mut c_void) -> c_int;
 }
 
 #[cfg(windows)]
@@ -70,31 +79,37 @@ pub unsafe fn create(key: &mut Key) {
 }
 
 #[cfg(windows)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn set(key: Key, value: *mut c_void) {
     assert!(0 != TlsSetValue(key, value))
 }
 
 #[cfg(windows)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn get(key: Key) -> *mut c_void {
     TlsGetValue(key)
 }
 
 #[cfg(windows, target_arch = "x86")]
 extern "stdcall" {
-       fn TlsAlloc() -> DWORD;
-       fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
-       fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    fn TlsAlloc() -> DWORD;
+
+    // See the reasoning in pthread_getspecific as to why this has the
+    // 'rust_stack' attribute, as this function was also verified to only
+    // require a small amount of stack.
+    #[rust_stack]
+    fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    #[rust_stack]
+    fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
 }
 
 #[cfg(windows, target_arch = "x86_64")]
 extern {
-       fn TlsAlloc() -> DWORD;
-       fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
-       fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    fn TlsAlloc() -> DWORD;
+
+    // See above.
+    #[rust_stack]
+    fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    #[rust_stack]
+    fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
 }
 
 #[test]
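
End to end, the unix API above can be exercised with a sketch like this (hypothetical test; the key is leaked, which is fine for illustration):

    unsafe {
        let mut key: Key = 0;
        create(&mut key);
        set(key, 0xbeef as *mut c_void);
        assert_eq!(get(key) as uint, 0xbeef);
    }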