7 files changed, 412 insertions, 67 deletions
diff --git a/src/libstd/rt/context.rs b/src/libstd/rt/context.rs
index 222f9a44b17..7f7545ca230 100644
--- a/src/libstd/rt/context.rs
+++ b/src/libstd/rt/context.rs
@@ -11,9 +11,12 @@
 use option::*;
 use super::stack::StackSegment;
 use libc::c_void;
+use uint;
 use cast::{transmute, transmute_mut_unsafe,
            transmute_region, transmute_mut_region};
 
+pub static RED_ZONE: uint = 20 * 1024;
+
 // FIXME #7761: Registers is boxed so that it is 16-byte aligned, for storing
 // SSE regs.  It would be marginally better not to do this. In C++ we
 // use an attribute on a struct.
@@ -24,14 +27,17 @@ pub struct Context {
     /// The context entry point, saved here for later destruction
     start: Option<~~fn()>,
     /// Hold the registers while the task or scheduler is suspended
-    regs: ~Registers
+    regs: ~Registers,
+    /// Lower bound and upper bound for the stack
+    stack_bounds: Option<(uint, uint)>,
 }
 
 impl Context {
     pub fn empty() -> Context {
         Context {
             start: None,
-            regs: new_regs()
+            regs: new_regs(),
+            stack_bounds: None,
         }
     }
 
@@ -47,7 +53,6 @@ impl Context {
 
         let fp: *c_void = task_start_wrapper as *c_void;
         let argp: *c_void = unsafe { transmute::<&~fn(), *c_void>(&*start) };
-        let stack_base: *uint = stack.start();
         let sp: *uint = stack.end();
         let sp: *mut uint = unsafe { transmute_mut_unsafe(sp) };
         // Save and then immediately load the current context,
@@ -57,11 +62,23 @@ impl Context {
             swap_registers(transmute_mut_region(&mut *regs), transmute_region(&*regs));
         };
 
-        initialize_call_frame(&mut *regs, fp, argp, sp, stack_base);
+        initialize_call_frame(&mut *regs, fp, argp, sp);
 
+        // Scheduler tasks don't have a stack in the "we allocated it" sense,
+        // but rather they run on pthreads stacks. We have complete control over
+        // them in terms of the code running on them (and hopefully they don't
+        // overflow). Additionally, their coroutine stacks are listed as being
+        // zero-length, so that's how we detect what's what here.
+        let stack_base: *uint = stack.start();
+        let bounds = if sp as uint == stack_base as uint {
+            None
+        } else {
+            Some((stack_base as uint, sp as uint))
+        };
         return Context {
             start: Some(start),
-            regs: regs
+            regs: regs,
+            stack_bounds: bounds,
         }
     }
 
@@ -79,8 +96,25 @@ impl Context {
         let in_regs: &Registers = match in_context {
             &Context { regs: ~ref r, _ } => r
         };
-        rtdebug!("doing raw swap");
-        unsafe { swap_registers(out_regs, in_regs) };
+
+        rtdebug!("noting the stack limit and doing raw swap");
+
+        unsafe {
+            // Right before we switch to the new context, set the new context's
+            // stack limit in the OS-specified TLS slot. This also  means that
+            // we cannot call any more rust functions after record_stack_bounds
+            // returns because they would all likely fail due to the limit being
+            // invalid for the current task. Lucky for us `swap_registers` is a
+            // C function so we don't have to worry about that!
+            match in_context.stack_bounds {
+                Some((lo, hi)) => record_stack_bounds(lo, hi),
+                // If we're going back to one of the original contexts or
+                // something that's possibly not a "normal task", then reset
+                // the stack limit to 0 to make morestack never fail
+                None => record_stack_bounds(0, uint::max_value),
+            }
+            swap_registers(out_regs, in_regs)
+        }
     }
 }
 
@@ -89,6 +123,29 @@ extern {
     fn swap_registers(out_regs: *mut Registers, in_regs: *Registers);
 }
 
+// Register contexts used in various architectures
+//
+// These structures all represent a context of one task throughout its
+// execution. Each struct is a representation of the architecture's register
+// set. When swapping between tasks, these register sets are used to save off
+// the current registers into one struct, and load them all from another.
+//
+// Note that this is only used for context switching, which means that some of
+// the registers may go unused. For example, for architectures with
+// callee/caller saved registers, the context will only reflect the callee-saved
+// registers. This is because the caller saved registers are already stored
+// elsewhere on the stack (if it was necessary anyway).
+//
+// Additionally, there may be fields on various architectures which are unused
+// entirely because they only reflect what is theoretically possible for a
+// "complete register set" to show, but user-space cannot alter these registers.
+// An example of this would be the segment selectors for x86.
+//
+// These structures/functions are roughly in-sync with the source files inside
+// of src/rt/arch/$arch. The only currently used function from those folders is
+// the `swap_registers` function, but that's only because for now segmented
+// stacks are disabled.
+
 #[cfg(target_arch = "x86")]
 struct Registers {
     eax: u32, ebx: u32, ecx: u32, edx: u32,
@@ -109,7 +166,7 @@ fn new_regs() -> ~Registers {
 
 #[cfg(target_arch = "x86")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
 
     let sp = align_down(sp);
     let sp = mut_offset(sp, -4);
@@ -125,6 +182,8 @@ fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
     regs.ebp = 0;
 }
 
+// windows requires saving more registers (both general and XMM), so the windows
+// register context must be larger.
 #[cfg(windows, target_arch = "x86_64")]
 type Registers = [uint, ..34];
 #[cfg(not(windows), target_arch = "x86_64")]
@@ -137,29 +196,14 @@ fn new_regs() -> ~Registers { ~([0, .. 22]) }
 
 #[cfg(target_arch = "x86_64")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, stack_base: *uint) {
+                         sp: *mut uint) {
 
-    // Redefinitions from regs.h
+    // Redefinitions from rt/arch/x86_64/regs.h
     static RUSTRT_ARG0: uint = 3;
     static RUSTRT_RSP: uint = 1;
     static RUSTRT_IP: uint = 8;
     static RUSTRT_RBP: uint = 2;
 
-    #[cfg(windows)]
-    fn initialize_tib(regs: &mut Registers, sp: *mut uint, stack_base: *uint) {
-        // Redefinitions from regs.h
-        static RUSTRT_ST1: uint = 11; // stack bottom
-        static RUSTRT_ST2: uint = 12; // stack top
-        regs[RUSTRT_ST1] = sp as uint;
-        regs[RUSTRT_ST2] = stack_base as uint;
-    }
-    #[cfg(not(windows))]
-    fn initialize_tib(_: &mut Registers, _: *mut uint, _: *uint) {
-    }
-
-    // Win64 manages stack range at TIB: %gs:0x08 (top) and %gs:0x10 (bottom)
-    initialize_tib(regs, sp, stack_base);
-
     let sp = align_down(sp);
     let sp = mut_offset(sp, -1);
 
@@ -167,9 +211,9 @@ fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
     unsafe { *sp = 0; }
 
     rtdebug!("creating call frame");
-    rtdebug!("fptr {}", fptr as uint);
-    rtdebug!("arg {}", arg as uint);
-    rtdebug!("sp {}", sp as uint);
+    rtdebug!("fptr {}", fptr);
+    rtdebug!("arg {}", arg);
+    rtdebug!("sp {}", sp);
 
     regs[RUSTRT_ARG0] = arg as uint;
     regs[RUSTRT_RSP] = sp as uint;
@@ -187,7 +231,7 @@ fn new_regs() -> ~Registers { ~([0, .. 32]) }
 
 #[cfg(target_arch = "arm")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
     let sp = align_down(sp);
     // sp of arm eabi is 8-byte aligned
     let sp = mut_offset(sp, -2);
@@ -208,7 +252,7 @@ fn new_regs() -> ~Registers { ~([0, .. 32]) }
 
 #[cfg(target_arch = "mips")]
 fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
-                         sp: *mut uint, _stack_base: *uint) {
+                         sp: *mut uint) {
     let sp = align_down(sp);
     // sp of mips o32 is 8-byte aligned
     let sp = mut_offset(sp, -2);
@@ -236,3 +280,182 @@ pub fn mut_offset<T>(ptr: *mut T, count: int) -> *mut T {
     use mem::size_of;
     (ptr as int + count * (size_of::<T>() as int)) as *mut T
 }
+
+#[inline(always)]
+pub unsafe fn record_stack_bounds(stack_lo: uint, stack_hi: uint) {
+    // When the old runtime had segmented stacks, it used a calculation that was
+    // "limit + RED_ZONE + FUDGE". The red zone was for things like dynamic
+    // symbol resolution, llvm function calls, etc. In theory this red zone
+    // value is 0, but it matters far less when we have gigantic stacks because
+    // we don't need to be so exact about our stack budget. The "fudge factor"
+    // was because LLVM doesn't emit a stack check for functions < 256 bytes in
+    // size. Again though, we have giant stacks, so we round all these
+    // calculations up to the nice round number of 20k.
+    record_sp_limit(stack_lo + RED_ZONE);
+
+    return target_record_stack_bounds(stack_lo, stack_hi);
+
+    #[cfg(not(windows))] #[cfg(not(target_arch = "x86_64"))] #[inline(always)]
+    unsafe fn target_record_stack_bounds(_stack_lo: uint, _stack_hi: uint) {}
+    #[cfg(windows, target_arch = "x86_64")] #[inline(always)]
+    unsafe fn target_record_stack_bounds(stack_lo: uint, stack_hi: uint) {
+        // Windows compiles C functions which may check the stack bounds. This
+        // means that if we want to perform valid FFI on windows, then we need
+        // to ensure that the stack bounds are what they truly are for this
+        // task. More info can be found at:
+        //   https://github.com/mozilla/rust/issues/3445#issuecomment-26114839
+        //
+        // stack range is at TIB: %gs:0x08 (top) and %gs:0x10 (bottom)
+        asm!("mov $0, %gs:0x08" :: "r"(stack_lo) :: "volatile");
+        asm!("mov $0, %gs:0x10" :: "r"(stack_hi) :: "volatile");
+    }
+}
+
+/// Records the current limit of the stack as specified by `end`.
+///
+/// This is stored in an OS-dependent location, likely inside of the thread
+/// local storage. The location that the limit is stored is a pre-ordained
+/// location because it's where LLVM has emitted code to check.
+///
+/// Note that this cannot be called under normal circumstances. This function is
+/// changing the stack limit, so upon returning any further function calls will
+/// possibly be triggering the morestack logic if you're not careful.
+///
+/// Also note that this and all of the inside functions are all flagged as
+/// "inline(always)" because they're messing around with the stack limits.  This
+/// would be unfortunate for the functions themselves to trigger a morestack
+/// invocation (if they were an actual function call).
+#[inline(always)]
+pub unsafe fn record_sp_limit(limit: uint) {
+    return target_record_sp_limit(limit);
+
+    // x86-64
+    #[cfg(target_arch = "x86_64", target_os = "macos")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $$0x60+90*8, %rsi
+              movq $0, %gs:(%rsi)" :: "r"(limit) : "rsi" : "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "linux")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $0, %fs:112" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "win32")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        // see: http://en.wikipedia.org/wiki/Win32_Thread_Information_Block
+        // store this inside of the "arbitrary data slot", but double the size
+        // because this is 64 bit instead of 32 bit
+        asm!("movq $0, %gs:0x28" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86_64", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movq $0, %fs:24" :: "r"(limit) :: "volatile")
+    }
+
+    // x86
+    #[cfg(target_arch = "x86", target_os = "macos")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movl $$0x48+90*4, %eax
+              movl $0, %gs:(%eax)" :: "r"(limit) : "eax" : "volatile")
+    }
+    #[cfg(target_arch = "x86", target_os = "linux")]
+    #[cfg(target_arch = "x86", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        asm!("movl $0, %gs:48" :: "r"(limit) :: "volatile")
+    }
+    #[cfg(target_arch = "x86", target_os = "win32")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        // see: http://en.wikipedia.org/wiki/Win32_Thread_Information_Block
+        // store this inside of the "arbitrary data slot"
+        asm!("movl $0, %fs:0x14" :: "r"(limit) :: "volatile")
+    }
+
+    // mips, arm - Some brave soul can port these to inline asm, but it's over
+    //             my head personally
+    #[cfg(target_arch = "mips")]
+    #[cfg(target_arch = "arm")] #[inline(always)]
+    unsafe fn target_record_sp_limit(limit: uint) {
+        return record_sp_limit(limit as *c_void);
+        extern {
+            #[rust_stack]
+            fn record_sp_limit(limit: *c_void);
+        }
+    }
+}
+
+/// The counterpart of the function above, this function will fetch the current
+/// stack limit stored in TLS.
+///
+/// Note that all of these functions are meant to be exact counterparts of their
+/// brethren above, except that the operands are reversed.
+///
+/// As with the setter, this function does not have a __morestack header and can
+/// therefore be called in a "we're out of stack" situation.
+#[inline(always)]
+// NOTE: after the next snapshot, can remove the initialization before inline
+//       assembly due to an improvement in how it's handled, then this specific
+//       allow directive should get removed.
+#[allow(dead_assignment)]
+pub unsafe fn get_sp_limit() -> uint {
+    return target_get_sp_limit();
+
+    // x86-64
+    #[cfg(target_arch = "x86_64", target_os = "macos")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq $$0x60+90*8, %rsi
+              movq %gs:(%rsi), $0" : "=r"(limit) :: "rsi" : "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "linux")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %fs:112, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "win32")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %gs:0x28, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86_64", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movq %fs:24, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+
+    // x86
+    #[cfg(target_arch = "x86", target_os = "macos")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl $$0x48+90*4, %eax
+              movl %gs:(%eax), $0" : "=r"(limit) :: "eax" : "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86", target_os = "linux")]
+    #[cfg(target_arch = "x86", target_os = "freebsd")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl %gs:48, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+    #[cfg(target_arch = "x86", target_os = "win32")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        let mut limit: uint = 0;
+        asm!("movl %fs:0x14, $0" : "=r"(limit) ::: "volatile");
+        return limit;
+    }
+
+    // mips, arm - Some brave soul can port these to inline asm, but it's over
+    //             my head personally
+    #[cfg(target_arch = "mips")]
+    #[cfg(target_arch = "arm")] #[inline(always)]
+    unsafe fn target_get_sp_limit() -> uint {
+        return get_sp_limit() as uint;
+        extern {
+            #[rust_stack]
+            fn get_sp_limit() -> *c_void;
+        }
+    }
+}
diff --git a/src/libstd/rt/crate_map.rs b/src/libstd/rt/crate_map.rs
index 8785dcca7bd..96a0069e851 100644
--- a/src/libstd/rt/crate_map.rs
+++ b/src/libstd/rt/crate_map.rs
@@ -17,7 +17,7 @@ use vec::ImmutableVector;
 // and instead look them up at runtime, which we need to resolve
 // the crate_map properly.
 #[cfg(target_os = "macos")]
-#[link_args = "-undefined dynamic_lookup"]
+#[link_args = "-Wl,-U,__rust_crate_map_toplevel"]
 extern {}
 
 pub struct ModEntry<'self> {
diff --git a/src/libstd/rt/env.rs b/src/libstd/rt/env.rs
index 5b840655120..c02e7fe9013 100644
--- a/src/libstd/rt/env.rs
+++ b/src/libstd/rt/env.rs
@@ -17,7 +17,7 @@ use os;
 // Note that these are all accessed without any synchronization.
 // They are expected to be initialized once then left alone.
 
-static mut MIN_STACK: uint = 2000000;
+static mut MIN_STACK: uint = 4000000;
 static mut DEBUG_BORROW: bool = false;
 
 pub fn init() {
diff --git a/src/libstd/rt/sched.rs b/src/libstd/rt/sched.rs
index 7724f58153e..93ac308df3a 100644
--- a/src/libstd/rt/sched.rs
+++ b/src/libstd/rt/sched.rs
@@ -173,7 +173,7 @@ impl Scheduler {
 
         // Now that we have an empty task struct for the scheduler
         // task, put it in TLS.
-        Local::put::(sched_task);
+        Local::put(sched_task);
 
         // Before starting our first task, make sure the idle callback
         // is active. As we do not start in the sleep state this is
diff --git a/src/libstd/rt/task.rs b/src/libstd/rt/task.rs
index 889d9bb3156..28c38ac9b53 100644
--- a/src/libstd/rt/task.rs
+++ b/src/libstd/rt/task.rs
@@ -29,6 +29,7 @@ use rt::logging::StdErrLogger;
 use super::local_heap::LocalHeap;
 use rt::sched::{Scheduler, SchedHandle};
 use rt::stack::{StackSegment, StackPool};
+use rt::context;
 use rt::context::Context;
 use unstable::finally::Finally;
 use task::spawn::Taskgroup;
@@ -465,6 +466,80 @@ impl Unwinder {
     }
 }
 
+/// This function is invoked from rust's current __morestack function. Segmented
+/// stacks are currently not enabled as segmented stacks, but rather one giant
+/// stack segment. This means that whenever we run out of stack, we want to
+/// truly consider it to be stack overflow rather than allocating a new stack.
+#[no_mangle]      // - this is called from C code
+#[no_split_stack] // - it would be sad for this function to trigger __morestack
+#[doc(hidden)] // XXX: this function shouldn't have to be `pub` to get exported
+               //      so it can be linked against, we should have a better way
+               //      of specifying that.
+pub extern "C" fn rust_stack_exhausted() {
+    use rt::in_green_task_context;
+    use rt::task::Task;
+    use rt::local::Local;
+    use rt::logging::Logger;
+    use unstable::intrinsics;
+
+    unsafe {
+        // We're calling this function because the stack just ran out. We need
+        // to call some other rust functions, but if we invoke the functions
+        // right now it'll just trigger this handler being called again. In
+        // order to alleviate this, we move the stack limit to be inside of the
+        // red zone that was allocated for exactly this reason.
+        let limit = context::get_sp_limit();
+        context::record_sp_limit(limit - context::RED_ZONE / 2);
+
+        // This probably isn't the best course of action. Ideally one would want
+        // to unwind the stack here instead of just aborting the entire process.
+        // This is a tricky problem, however. There's a few things which need to
+        // be considered:
+        //
+        //  1. We're here because of a stack overflow, yet unwinding will run
+        //     destructors and hence arbitrary code. What if that code overflows
+        //     the stack? One possibility is to use the above allocation of an
+        //     extra 10k to hope that we don't hit the limit, and if we do then
+        //     abort the whole program. Not the best, but kind of hard to deal
+        //     with unless we want to switch stacks.
+        //
+        //  2. LLVM will optimize functions based on whether they can unwind or
+        //     not. It will flag functions with 'nounwind' if it believes that
+        //     the function cannot trigger unwinding, but if we do unwind on
+        //     stack overflow then it means that we could unwind in any function
+        //     anywhere. We would have to make sure that LLVM only places the
+        //     nounwind flag on functions which don't call any other functions.
+        //
+        //  3. The function that overflowed may have owned arguments. These
+        //     arguments need to have their destructors run, but we haven't even
+        //     begun executing the function yet, so unwinding will not run the
+        //     any landing pads for these functions. If this is ignored, then
+        //     the arguments will just be leaked.
+        //
+        // Exactly what to do here is a very delicate topic, and is possibly
+        // still up in the air for what exactly to do. Some relevant issues:
+        //
+        //  #3555 - out-of-stack failure leaks arguments
+        //  #3695 - should there be a stack limit?
+        //  #9855 - possible strategies which could be taken
+        //  #9854 - unwinding on windows through __morestack has never worked
+        //  #2361 - possible implementation of not using landing pads
+
+        if in_green_task_context() {
+            do Local::borrow |task: &mut Task| {
+                let n = task.name.as_ref().map(|n| n.as_slice()).unwrap_or("<unnamed>");
+
+                format_args!(|args| { task.logger.log(args) },
+                             "task '{}' has overflowed its stack", n);
+            }
+        } else {
+            rterrln!("stack overflow in non-task context");
+        }
+
+        intrinsics::abort();
+    }
+}
+
 /// This is the entry point of unwinding for things like lang items and such.
 /// The arguments are normally generated by the compiler.
 pub fn begin_unwind(msg: *c_char, file: *c_char, line: size_t) -> ! {
@@ -481,22 +556,33 @@ pub fn begin_unwind(msg: *c_char, file: *c_char, line: size_t) -> ! {
         let msg = match msg.as_str() {
             Some(s) => s, None => rtabort!("message wasn't utf8?")
         };
-        let file = match file.as_str() {
-            Some(s) => s, None => rtabort!("message wasn't utf8?")
-        };
 
         if in_green_task_context() {
             // Be careful not to allocate in this block, if we're failing we may
             // have been failing due to a lack of memory in the first place...
             do Local::borrow |task: &mut Task| {
                 let n = task.name.as_ref().map(|n| n.as_slice()).unwrap_or("<unnamed>");
-                format_args!(|args| { task.logger.log(args) },
-                             "task '{}' failed at '{}', {}:{}",
-                             n, msg, file, line);
+
+                match file.as_str() {
+                    Some(file) => {
+                        format_args!(|args| { task.logger.log(args) },
+                                     "task '{}' failed at '{}', {}:{}",
+                                     n, msg, file, line);
+                    }
+                    None => {
+                        format_args!(|args| { task.logger.log(args) },
+                                     "task '{}' failed at '{}'", n, msg);
+                    }
+                }
             }
         } else {
-            rterrln!("failed in non-task context at '{}', {}:{}",
-                     msg, file, line as int);
+            match file.as_str() {
+                Some(file) => {
+                    rterrln!("failed in non-task context at '{}', {}:{}",
+                             msg, file, line as int);
+                }
+                None => rterrln!("failed in non-task context at '{}'", msg),
+            }
         }
 
         let task: *mut Task = Local::unsafe_borrow();
diff --git a/src/libstd/rt/thread.rs b/src/libstd/rt/thread.rs
index 8b64fda2136..e774b81da35 100644
--- a/src/libstd/rt/thread.rs
+++ b/src/libstd/rt/thread.rs
@@ -8,8 +8,11 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use cast;
 use libc;
 use ops::Drop;
+use unstable::raw;
+use uint;
 
 #[allow(non_camel_case_types)] // runtime type
 type raw_thread = libc::c_void;
@@ -17,21 +20,38 @@ type raw_thread = libc::c_void;
 pub struct Thread {
     main: ~fn(),
     raw_thread: *raw_thread,
-    joined: bool
+    joined: bool,
 }
 
 impl Thread {
+    #[fixed_stack_segment] #[inline(never)]
     pub fn start(main: ~fn()) -> Thread {
-        fn substart(main: &~fn()) -> *raw_thread {
-            #[fixed_stack_segment]; #[inline(never)];
-
-            unsafe { rust_raw_thread_start(main) }
+        // This is the starting point of rust os threads. The first thing we do
+        // is make sure that we don't trigger __morestack (also why this has a
+        // no_split_stack annotation), and then we re-build the main function
+        // and invoke it from there.
+        #[no_split_stack]
+        extern "C" fn thread_start(code: *(), env: *()) {
+            use rt::context;
+            unsafe {
+                context::record_stack_bounds(0, uint::max_value);
+                let f: &fn() = cast::transmute(raw::Closure {
+                    code: code,
+                    env: env,
+                });
+                f();
+            }
         }
-        let raw = substart(&main);
+
+        let raw_thread = unsafe {
+            let c: raw::Closure = cast::transmute_copy(&main);
+            let raw::Closure { code, env } = c;
+            rust_raw_thread_start(thread_start, code, env)
+        };
         Thread {
             main: main,
-            raw_thread: raw,
-            joined: false
+            raw_thread: raw_thread,
+            joined: false,
         }
     }
 
@@ -55,7 +75,8 @@ impl Drop for Thread {
 }
 
 extern {
-    pub fn rust_raw_thread_start(f: &(~fn())) -> *raw_thread;
-    pub fn rust_raw_thread_join(thread: *raw_thread);
-    pub fn rust_raw_thread_delete(thread: *raw_thread);
+    fn rust_raw_thread_start(f: extern "C" fn(*(), *()),
+                             code: *(), env: *()) -> *raw_thread;
+    fn rust_raw_thread_join(thread: *raw_thread);
+    fn rust_raw_thread_delete(thread: *raw_thread);
 }
diff --git a/src/libstd/rt/thread_local_storage.rs b/src/libstd/rt/thread_local_storage.rs
index cd89d09ffc0..ddb104240f2 100644
--- a/src/libstd/rt/thread_local_storage.rs
+++ b/src/libstd/rt/thread_local_storage.rs
@@ -27,15 +27,11 @@ pub unsafe fn create(key: &mut Key) {
 }
 
 #[cfg(unix)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn set(key: Key, value: *mut c_void) {
     assert_eq!(0, pthread_setspecific(key, value));
 }
 
 #[cfg(unix)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn get(key: Key) -> *mut c_void {
     pthread_getspecific(key)
 }
@@ -53,8 +49,21 @@ type pthread_key_t = ::libc::c_uint;
 #[cfg(unix)]
 extern {
     fn pthread_key_create(key: *mut pthread_key_t, dtor: *u8) -> c_int;
-    fn pthread_setspecific(key: pthread_key_t, value: *mut c_void) -> c_int;
+
+    // This function is a very cheap operation on both osx and unix. On osx, it
+    // turns out it's just three instructions, and on unix it's a cheap function
+    // which only uses a very small amount of stack.
+    //
+    // This is not marked as such because we think it has a small stack, but
+    // rather we would like to be able to fetch information from
+    // thread-local-storage when a task is running very low on its stack budget.
+    // For example, this is invoked whenever stack overflow is detected, and we
+    // obviously have very little budget to deal with (certainly not anything
+    // close to a fixed_stack_segment)
+    #[rust_stack]
     fn pthread_getspecific(key: pthread_key_t) -> *mut c_void;
+    #[rust_stack]
+    fn pthread_setspecific(key: pthread_key_t, value: *mut c_void) -> c_int;
 }
 
 #[cfg(windows)]
@@ -70,31 +79,37 @@ pub unsafe fn create(key: &mut Key) {
 }
 
 #[cfg(windows)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn set(key: Key, value: *mut c_void) {
     assert!(0 != TlsSetValue(key, value))
 }
 
 #[cfg(windows)]
-#[fixed_stack_segment]
-#[inline(never)]
 pub unsafe fn get(key: Key) -> *mut c_void {
     TlsGetValue(key)
 }
 
 #[cfg(windows, target_arch = "x86")]
 extern "stdcall" {
-       fn TlsAlloc() -> DWORD;
-       fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
-       fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    fn TlsAlloc() -> DWORD;
+
+    // See the reasoning in pthread_getspecific as to why this has the
+    // 'rust_stack' attribute, as this function was also verified to only
+    // require a small amount of stack.
+    #[rust_stack]
+    fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    #[rust_stack]
+    fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
 }
 
 #[cfg(windows, target_arch = "x86_64")]
 extern {
-       fn TlsAlloc() -> DWORD;
-       fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
-       fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    fn TlsAlloc() -> DWORD;
+
+    // See above.
+    #[rust_stack]
+    fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
+    #[rust_stack]
+    fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
 }
 
 #[test]