Diffstat (limited to 'src')
-rw-r--r--  src/librustc/session/config.rs                18
-rw-r--r--  src/librustc_llvm/ffi.rs                        7
-rw-r--r--  src/librustc_trans/back/archive.rs              8
-rw-r--r--  src/librustc_trans/back/bytecode.rs           160
-rw-r--r--  src/librustc_trans/back/link.rs               131
-rw-r--r--  src/librustc_trans/back/lto.rs                279
-rw-r--r--  src/librustc_trans/back/write.rs              380
-rw-r--r--  src/librustc_trans/base.rs                     66
-rw-r--r--  src/librustc_trans/context.rs                  13
-rw-r--r--  src/librustc_trans/lib.rs                      71
-rw-r--r--  src/librustc_trans/time_graph.rs               23
-rw-r--r--  src/rustllvm/RustWrapper.cpp                   62
-rw-r--r--  src/test/run-pass/lto-many-codegen-units.rs    15
13 files changed, 803 insertions, 430 deletions
diff --git a/src/librustc/session/config.rs b/src/librustc/session/config.rs
index 153d8c3a152..7c1d457a6ee 100644
--- a/src/librustc/session/config.rs
+++ b/src/librustc/session/config.rs
@@ -1552,24 +1552,6 @@ pub fn build_session_options_and_crate_config(matches: &getopts::Matches)
         early_error(error_format, "Value for codegen units must be a positive nonzero integer");
     }
 
-    // It's possible that we have `codegen_units > 1` but only one item in
-    // `trans.modules`.  We could theoretically proceed and do LTO in that
-    // case, but it would be confusing to have the validity of
-    // `-Z lto -C codegen-units=2` depend on details of the crate being
-    // compiled, so we complain regardless.
-    if cg.lto {
-        if let Some(n) = codegen_units {
-            if n > 1 {
-                // This case is impossible to handle because LTO expects to be able
-                // to combine the entire crate and all its dependencies into a
-                // single compilation unit, but each codegen unit is in a separate
-                // LLVM context, so they can't easily be combined.
-                early_error(error_format, "can't perform LTO when using multiple codegen units");
-            }
-        }
-        codegen_units = Some(1);
-    }
-
     if cg.lto && debugging_opts.incremental.is_some() {
         early_error(error_format, "can't perform LTO when compiling incrementally");
     }
diff --git a/src/librustc_llvm/ffi.rs b/src/librustc_llvm/ffi.rs
index d91c706e24e..fd4a136f50b 100644
--- a/src/librustc_llvm/ffi.rs
+++ b/src/librustc_llvm/ffi.rs
@@ -478,6 +478,7 @@ pub mod debuginfo {
     }
 }
 
+pub enum ModuleBuffer {}
 
 // Link to our native llvm bindings (things that we need to use the C++ api
 // for) and because llvm is written in C++ we need to link against libstdc++
@@ -1609,6 +1610,7 @@ extern "C" {
     pub fn LLVMRustSetNormalizedTarget(M: ModuleRef, triple: *const c_char);
     pub fn LLVMRustAddAlwaysInlinePass(P: PassManagerBuilderRef, AddLifetimes: bool);
     pub fn LLVMRustLinkInExternalBitcode(M: ModuleRef, bc: *const c_char, len: size_t) -> bool;
+    pub fn LLVMRustLinkInParsedExternalBitcode(M: ModuleRef, Src: ModuleRef) -> bool;
     pub fn LLVMRustRunRestrictionPass(M: ModuleRef, syms: *const *const c_char, len: size_t);
     pub fn LLVMRustMarkAllFunctionsNounwind(M: ModuleRef);
 
@@ -1678,4 +1680,9 @@ extern "C" {
     pub fn LLVMRustSetComdat(M: ModuleRef, V: ValueRef, Name: *const c_char);
     pub fn LLVMRustUnsetComdat(V: ValueRef);
     pub fn LLVMRustSetModulePIELevel(M: ModuleRef);
+    pub fn LLVMRustModuleBufferCreate(M: ModuleRef) -> *mut ModuleBuffer;
+    pub fn LLVMRustModuleBufferPtr(p: *const ModuleBuffer) -> *const u8;
+    pub fn LLVMRustModuleBufferLen(p: *const ModuleBuffer) -> usize;
+    pub fn LLVMRustModuleBufferFree(p: *mut ModuleBuffer);
+    pub fn LLVMRustModuleCost(M: ModuleRef) -> u64;
 }
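
The `ModuleBuffer` declaration above is the usual opaque-handle pattern: an empty enum has no values, so Rust code can never construct one and only ever handles raw pointers returned by the C++ side. A minimal sketch of pulling the serialized bytes out of such a buffer, assuming it sits next to the declarations above (the safe RAII wrapper actually used by the compiler is added to `lto.rs` later in this patch):

```rust
// Sketch only: assumes this lives in librustc_llvm next to the extern block
// above, so `ModuleBuffer` and the `LLVMRustModuleBuffer*` functions are in scope.
unsafe fn buffer_bytes(buf: *const ModuleBuffer) -> Vec<u8> {
    let ptr = LLVMRustModuleBufferPtr(buf);
    let len = LLVMRustModuleBufferLen(buf);
    // Copy the bytes out so the buffer can be freed independently of the Vec.
    std::slice::from_raw_parts(ptr, len).to_vec()
}
```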
diff --git a/src/librustc_trans/back/archive.rs b/src/librustc_trans/back/archive.rs
index 0d39db9e10a..179ef20b19f 100644
--- a/src/librustc_trans/back/archive.rs
+++ b/src/librustc_trans/back/archive.rs
@@ -17,6 +17,7 @@ use std::path::{Path, PathBuf};
 use std::ptr;
 use std::str;
 
+use back::bytecode::RLIB_BYTECODE_EXTENSION;
 use libc;
 use llvm::archive_ro::{ArchiveRO, Child};
 use llvm::{self, ArchiveKind};
@@ -154,12 +155,9 @@ impl<'a> ArchiveBuilder<'a> {
         // might be also an extra name suffix
         let obj_start = format!("{}", name);
 
-        // Ignoring all bytecode files, no matter of
-        // name
-        let bc_ext = ".bytecode.deflate";
-
         self.add_archive(rlib, move |fname: &str| {
-            if fname.ends_with(bc_ext) || fname == METADATA_FILENAME {
+            // Ignore bytecode/metadata files, no matter the name.
+            if fname.ends_with(RLIB_BYTECODE_EXTENSION) || fname == METADATA_FILENAME {
                 return true
             }
 
diff --git a/src/librustc_trans/back/bytecode.rs b/src/librustc_trans/back/bytecode.rs
new file mode 100644
index 00000000000..55c96322a95
--- /dev/null
+++ b/src/librustc_trans/back/bytecode.rs
@@ -0,0 +1,160 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Management of the encoding of LLVM bytecode into rlibs
+//!
+//! This module handles encoding LLVM bytecode into rlibs, primarily for use
+//! in LTO situations. The compiler currently encodes LLVM-IR into rlibs
+//! unconditionally, regardless of what's happening elsewhere, so the bytecode
+//! is compressed via deflate to avoid taking up too much space on disk.
+//!
+//! Beyond compression, the rest of the format exists mostly to work around
+//! bugs in various archive implementations. The format currently is:
+//!
+//!     RLIB LLVM-BYTECODE OBJECT LAYOUT
+//!     Version 2
+//!     Bytes    Data
+//!     0..10    "RUST_OBJECT" encoded in ASCII
+//!     11..14   format version as little-endian u32
+//!     15..18   the length of the module identifier string (little-endian u32)
+//!     19..n    the module identifier string
+//!     n..n+7   size in bytes of deflate compressed LLVM bitcode as
+//!              little-endian u64
+//!     n+8..    compressed LLVM bitcode
+//!     ?        maybe a byte to make this whole thing even length
+
+use std::io::{Read, Write};
+use std::ptr;
+use std::str;
+
+use flate2::Compression;
+use flate2::read::DeflateDecoder;
+use flate2::write::DeflateEncoder;
+
+// This is the "magic number" expected at the beginning of an LLVM bytecode
+// object in an rlib.
+pub const RLIB_BYTECODE_OBJECT_MAGIC: &'static [u8] = b"RUST_OBJECT";
+
+// The version number this compiler will write to bytecode objects in rlibs
+pub const RLIB_BYTECODE_OBJECT_VERSION: u8 = 2;
+
+pub const RLIB_BYTECODE_EXTENSION: &str = "bytecode.encoded";
+
+pub fn encode(identifier: &str, bytecode: &[u8]) -> Vec<u8> {
+    let mut encoded = Vec::new();
+
+    // Start off with the magic string
+    encoded.extend_from_slice(RLIB_BYTECODE_OBJECT_MAGIC);
+
+    // Next up is the version
+    encoded.extend_from_slice(&[RLIB_BYTECODE_OBJECT_VERSION, 0, 0, 0]);
+
+    // Next is the LLVM module identifier length + contents
+    let identifier_len = identifier.len();
+    encoded.extend_from_slice(&[
+        (identifier_len >>  0) as u8,
+        (identifier_len >>  8) as u8,
+        (identifier_len >> 16) as u8,
+        (identifier_len >> 24) as u8,
+    ]);
+    encoded.extend_from_slice(identifier.as_bytes());
+
+    // Next is the LLVM module deflate compressed, prefixed with its length. We
+    // don't know its length yet, so fill in 0s
+    let deflated_size_pos = encoded.len();
+    encoded.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0]);
+
+    let before = encoded.len();
+    DeflateEncoder::new(&mut encoded, Compression::Fast)
+        .write_all(bytecode)
+        .unwrap();
+    let after = encoded.len();
+
+    // Fill in the length we reserved space for before
+    let bytecode_len = (after - before) as u64;
+    encoded[deflated_size_pos + 0] = (bytecode_len >>  0) as u8;
+    encoded[deflated_size_pos + 1] = (bytecode_len >>  8) as u8;
+    encoded[deflated_size_pos + 2] = (bytecode_len >> 16) as u8;
+    encoded[deflated_size_pos + 3] = (bytecode_len >> 24) as u8;
+    encoded[deflated_size_pos + 4] = (bytecode_len >> 32) as u8;
+    encoded[deflated_size_pos + 5] = (bytecode_len >> 40) as u8;
+    encoded[deflated_size_pos + 6] = (bytecode_len >> 48) as u8;
+    encoded[deflated_size_pos + 7] = (bytecode_len >> 56) as u8;
+
+    // If the number of bytes written to the object so far is odd, add a
+    // padding byte to make it even. This works around a crash bug in LLDB
+    // (see issue #15950)
+    if encoded.len() % 2 == 1 {
+        encoded.push(0);
+    }
+
+    return encoded
+}
+
+pub struct DecodedBytecode<'a> {
+    identifier: &'a str,
+    encoded_bytecode: &'a [u8],
+}
+
+impl<'a> DecodedBytecode<'a> {
+    pub fn new(data: &'a [u8]) -> Result<DecodedBytecode<'a>, String> {
+        if !data.starts_with(RLIB_BYTECODE_OBJECT_MAGIC) {
+            return Err(format!("magic bytecode prefix not found"))
+        }
+        let data = &data[RLIB_BYTECODE_OBJECT_MAGIC.len()..];
+        if !data.starts_with(&[RLIB_BYTECODE_OBJECT_VERSION, 0, 0, 0]) {
+            return Err(format!("wrong version prefix found in bytecode"))
+        }
+        let data = &data[4..];
+        if data.len() < 4 {
+            return Err(format!("bytecode corrupted"))
+        }
+        let identifier_len = unsafe {
+            u32::from_le(ptr::read_unaligned(data.as_ptr() as *const u32)) as usize
+        };
+        let data = &data[4..];
+        if data.len() < identifier_len {
+            return Err(format!("bytecode corrupted"))
+        }
+        let identifier = match str::from_utf8(&data[..identifier_len]) {
+            Ok(s) => s,
+            Err(_) => return Err(format!("bytecode corrupted"))
+        };
+        let data = &data[identifier_len..];
+        if data.len() < 8 {
+            return Err(format!("bytecode corrupted"))
+        }
+        let bytecode_len = unsafe {
+            u64::from_le(ptr::read_unaligned(data.as_ptr() as *const u64)) as usize
+        };
+        let data = &data[8..];
+        if data.len() < bytecode_len {
+            return Err(format!("bytecode corrupted"))
+        }
+        let encoded_bytecode = &data[..bytecode_len];
+
+        Ok(DecodedBytecode {
+            identifier,
+            encoded_bytecode,
+        })
+    }
+
+    pub fn bytecode(&self) -> Vec<u8> {
+        let mut data = Vec::new();
+        DeflateDecoder::new(self.encoded_bytecode).read_to_end(&mut data).unwrap();
+        return data
+    }
+
+    pub fn identifier(&self) -> &'a str {
+        self.identifier
+    }
+}
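
Taken together, `encode` and `DecodedBytecode` round-trip an LLVM module's bitcode through the rlib object format described in the header comment. A minimal sketch of that round trip, written as if it were a unit test inside `librustc_trans` (the identifier and payload are made up for illustration):

```rust
// Sketch only: exercises the encode/decode pair added above, assuming it is
// reachable as `back::bytecode` from within librustc_trans.
use back::bytecode::{encode, DecodedBytecode};

fn bytecode_roundtrip() {
    let bitcode: &[u8] = b"BC\xc0\xde"; // stand-in for real LLVM bitcode
    let encoded = encode("my_crate.0.rs", bitcode);

    // The encoded object is always padded to an even length (LLDB workaround).
    assert!(encoded.len() % 2 == 0);

    let decoded = DecodedBytecode::new(&encoded).expect("well-formed object");
    assert_eq!(decoded.identifier(), "my_crate.0.rs");
    assert_eq!(decoded.bytecode(), bitcode);
}
```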
diff --git a/src/librustc_trans/back/link.rs b/src/librustc_trans/back/link.rs
index 619ea59ff1a..3badc1b9a69 100644
--- a/src/librustc_trans/back/link.rs
+++ b/src/librustc_trans/back/link.rs
@@ -9,6 +9,7 @@
 // except according to those terms.
 
 use super::archive::{ArchiveBuilder, ArchiveConfig};
+use super::bytecode::{self, RLIB_BYTECODE_EXTENSION};
 use super::linker::Linker;
 use super::command::Command;
 use super::rpath::RPathConfig;
@@ -36,12 +37,9 @@ use std::ffi::OsString;
 use std::fmt;
 use std::fs::{self, File};
 use std::io::{self, Read, Write, BufWriter};
-use std::mem;
 use std::path::{Path, PathBuf};
 use std::process::{Output, Stdio};
 use std::str;
-use flate2::Compression;
-use flate2::write::DeflateEncoder;
 use syntax::attr;
 
 /// The LLVM module name containing crate-metadata. This includes a `.` on
@@ -55,35 +53,6 @@ pub const METADATA_OBJ_NAME: &'static str = "crate.metadata.o";
 pub const ALLOCATOR_MODULE_NAME: &'static str = "crate.allocator";
 pub const ALLOCATOR_OBJ_NAME: &'static str = "crate.allocator.o";
 
-// RLIB LLVM-BYTECODE OBJECT LAYOUT
-// Version 1
-// Bytes    Data
-// 0..10    "RUST_OBJECT" encoded in ASCII
-// 11..14   format version as little-endian u32
-// 15..22   size in bytes of deflate compressed LLVM bitcode as
-//          little-endian u64
-// 23..     compressed LLVM bitcode
-
-// This is the "magic number" expected at the beginning of a LLVM bytecode
-// object in an rlib.
-pub const RLIB_BYTECODE_OBJECT_MAGIC: &'static [u8] = b"RUST_OBJECT";
-
-// The version number this compiler will write to bytecode objects in rlibs
-pub const RLIB_BYTECODE_OBJECT_VERSION: u32 = 1;
-
-// The offset in bytes the bytecode object format version number can be found at
-pub const RLIB_BYTECODE_OBJECT_VERSION_OFFSET: usize = 11;
-
-// The offset in bytes the size of the compressed bytecode can be found at in
-// format version 1
-pub const RLIB_BYTECODE_OBJECT_V1_DATASIZE_OFFSET: usize =
-    RLIB_BYTECODE_OBJECT_VERSION_OFFSET + 4;
-
-// The offset in bytes the compressed LLVM bytecode can be found at in format
-// version 1
-pub const RLIB_BYTECODE_OBJECT_V1_DATA_OFFSET: usize =
-    RLIB_BYTECODE_OBJECT_V1_DATASIZE_OFFSET + 8;
-
 pub use rustc_trans_utils::link::{find_crate_name, filename_for_input, default_output_for_target,
                                   invalid_output_for_target, build_link_meta, out_filename,
                                   check_file_is_writeable};
@@ -201,8 +170,8 @@ pub fn link_binary(sess: &Session,
     // Remove the temporary object file and metadata if we aren't saving temps
     if !sess.opts.cg.save_temps {
         if sess.opts.output_types.should_trans() {
-            for obj in object_filenames(trans, outputs) {
-                remove(sess, &obj);
+            for obj in trans.modules.iter() {
+                remove(sess, &obj.object);
             }
         }
         remove(sess, &outputs.with_extension(METADATA_OBJ_NAME));
@@ -282,10 +251,8 @@ fn link_binary_output(sess: &Session,
                       crate_type: config::CrateType,
                       outputs: &OutputFilenames,
                       crate_name: &str) -> Vec<PathBuf> {
-    let objects = object_filenames(trans, outputs);
-
-    for file in &objects {
-        check_file_is_writeable(file, sess);
+    for module in trans.modules.iter() {
+        check_file_is_writeable(&module.object, sess);
     }
 
     let tmpdir = match TempDir::new("rustc") {
@@ -308,7 +275,6 @@ fn link_binary_output(sess: &Session,
                 link_rlib(sess,
                           trans,
                           RlibFlavor::Normal,
-                          &objects,
                           outputs,
                           &out_filename,
                           tmpdir.path()).build();
@@ -317,12 +283,11 @@ fn link_binary_output(sess: &Session,
                 link_staticlib(sess,
                                trans,
                                outputs,
-                               &objects,
                                &out_filename,
                                tmpdir.path());
             }
             _ => {
-                link_natively(sess, crate_type, &objects, &out_filename,
+                link_natively(sess, crate_type, &out_filename,
                               trans, outputs, tmpdir.path());
             }
         }
@@ -336,14 +301,6 @@ fn link_binary_output(sess: &Session,
     out_filenames
 }
 
-fn object_filenames(trans: &CrateTranslation,
-                    outputs: &OutputFilenames)
-                    -> Vec<PathBuf> {
-    trans.modules.iter().map(|module| {
-        outputs.temp_path(OutputType::Object, Some(&module.name))
-    }).collect()
-}
-
 fn archive_search_paths(sess: &Session) -> Vec<PathBuf> {
     let mut search = Vec::new();
     sess.target_filesearch(PathKind::Native).for_each_lib_search_path(|path, _| {
@@ -387,15 +344,14 @@ enum RlibFlavor {
 fn link_rlib<'a>(sess: &'a Session,
                  trans: &CrateTranslation,
                  flavor: RlibFlavor,
-                 objects: &[PathBuf],
                  outputs: &OutputFilenames,
                  out_filename: &Path,
                  tmpdir: &Path) -> ArchiveBuilder<'a> {
-    info!("preparing rlib from {:?} to {:?}", objects, out_filename);
+    info!("preparing rlib to {:?}", out_filename);
     let mut ab = ArchiveBuilder::new(archive_config(sess, out_filename, None));
 
-    for obj in objects {
-        ab.add_file(obj);
+    for module in trans.modules.iter() {
+        ab.add_file(&module.object);
     }
 
     // Note that in this loop we are ignoring the value of `lib.cfg`. That is,
@@ -462,15 +418,15 @@ fn link_rlib<'a>(sess: &'a Session,
             // For LTO purposes, the bytecode of this library is also inserted
             // into the archive.  If codegen_units > 1, we insert each of the
             // bitcode files.
-            for obj in objects {
+            for module in trans.modules.iter() {
                 // Note that we make sure that the bytecode filename in the
                 // archive is never exactly 16 bytes long by adding a 16 byte
                 // extension to it. This is to work around a bug in LLDB that
                 // would cause it to crash if the name of a file in an archive
                 // was exactly 16 bytes.
-                let bc_filename = obj.with_extension("bc");
-                let bc_deflated_filename = tmpdir.join({
-                    obj.with_extension("bytecode.deflate").file_name().unwrap()
+                let bc_filename = module.object.with_extension("bc");
+                let bc_encoded_filename = tmpdir.join({
+                    module.object.with_extension(RLIB_BYTECODE_EXTENSION).file_name().unwrap()
                 });
 
                 let mut bc_data = Vec::new();
@@ -482,11 +438,9 @@ fn link_rlib<'a>(sess: &'a Session,
                                                  e))
                 }
 
-                let mut bc_data_deflated = Vec::new();
-                DeflateEncoder::new(&mut bc_data_deflated, Compression::Fast)
-                    .write_all(&bc_data).unwrap();
+                let encoded = bytecode::encode(&module.llmod_id, &bc_data);
 
-                let mut bc_file_deflated = match fs::File::create(&bc_deflated_filename) {
+                let mut bc_file_deflated = match fs::File::create(&bc_encoded_filename) {
                     Ok(file) => file,
                     Err(e) => {
                         sess.fatal(&format!("failed to create compressed \
@@ -494,8 +448,7 @@ fn link_rlib<'a>(sess: &'a Session,
                     }
                 };
 
-                match write_rlib_bytecode_object_v1(&mut bc_file_deflated,
-                                                    &bc_data_deflated) {
+                match bc_file_deflated.write_all(&encoded) {
                     Ok(()) => {}
                     Err(e) => {
                         sess.fatal(&format!("failed to write compressed \
@@ -503,7 +456,7 @@ fn link_rlib<'a>(sess: &'a Session,
                     }
                 };
 
-                ab.add_file(&bc_deflated_filename);
+                ab.add_file(&bc_encoded_filename);
 
                 // See the bottom of back::write::run_passes for an explanation
                 // of when we do and don't keep .#module-name#.bc files around.
@@ -533,40 +486,6 @@ fn link_rlib<'a>(sess: &'a Session,
     ab
 }
 
-fn write_rlib_bytecode_object_v1(writer: &mut Write,
-                                 bc_data_deflated: &[u8]) -> io::Result<()> {
-    let bc_data_deflated_size: u64 = bc_data_deflated.len() as u64;
-
-    writer.write_all(RLIB_BYTECODE_OBJECT_MAGIC)?;
-    writer.write_all(&[1, 0, 0, 0])?;
-    writer.write_all(&[
-        (bc_data_deflated_size >>  0) as u8,
-        (bc_data_deflated_size >>  8) as u8,
-        (bc_data_deflated_size >> 16) as u8,
-        (bc_data_deflated_size >> 24) as u8,
-        (bc_data_deflated_size >> 32) as u8,
-        (bc_data_deflated_size >> 40) as u8,
-        (bc_data_deflated_size >> 48) as u8,
-        (bc_data_deflated_size >> 56) as u8,
-    ])?;
-    writer.write_all(&bc_data_deflated)?;
-
-    let number_of_bytes_written_so_far =
-        RLIB_BYTECODE_OBJECT_MAGIC.len() +                // magic id
-        mem::size_of_val(&RLIB_BYTECODE_OBJECT_VERSION) + // version
-        mem::size_of_val(&bc_data_deflated_size) +        // data size field
-        bc_data_deflated_size as usize;                    // actual data
-
-    // If the number of bytes written to the object so far is odd, add a
-    // padding byte to make it even. This works around a crash bug in LLDB
-    // (see issue #15950)
-    if number_of_bytes_written_so_far % 2 == 1 {
-        writer.write_all(&[0])?;
-    }
-
-    return Ok(());
-}
-
 // Create a static archive
 //
 // This is essentially the same thing as an rlib, but it also involves adding
@@ -582,13 +501,11 @@ fn write_rlib_bytecode_object_v1(writer: &mut Write,
 fn link_staticlib(sess: &Session,
                   trans: &CrateTranslation,
                   outputs: &OutputFilenames,
-                  objects: &[PathBuf],
                   out_filename: &Path,
                   tempdir: &Path) {
     let mut ab = link_rlib(sess,
                            trans,
                            RlibFlavor::StaticlibBase,
-                           objects,
                            outputs,
                            out_filename,
                            tempdir);
@@ -692,12 +609,11 @@ fn print_native_static_libs(sess: &Session, all_native_libs: &[NativeLibrary]) {
 // links to all upstream files as well.
 fn link_natively(sess: &Session,
                  crate_type: config::CrateType,
-                 objects: &[PathBuf],
                  out_filename: &Path,
                  trans: &CrateTranslation,
                  outputs: &OutputFilenames,
                  tmpdir: &Path) {
-    info!("preparing {:?} from {:?} to {:?}", crate_type, objects, out_filename);
+    info!("preparing {:?} to {:?}", crate_type, out_filename);
     let flavor = sess.linker_flavor();
 
     // The invocations of cc share some flags across platforms
@@ -735,7 +651,7 @@ fn link_natively(sess: &Session,
     {
         let mut linker = trans.linker_info.to_linker(cmd, &sess);
         link_args(&mut *linker, sess, crate_type, tmpdir,
-                  objects, out_filename, outputs, trans);
+                  out_filename, outputs, trans);
         cmd = linker.finalize();
     }
     if let Some(args) = sess.target.target.options.late_link_args.get(&flavor) {
@@ -956,7 +872,6 @@ fn link_args(cmd: &mut Linker,
              sess: &Session,
              crate_type: config::CrateType,
              tmpdir: &Path,
-             objects: &[PathBuf],
              out_filename: &Path,
              outputs: &OutputFilenames,
              trans: &CrateTranslation) {
@@ -969,8 +884,8 @@ fn link_args(cmd: &mut Linker,
     let t = &sess.target.target;
 
     cmd.include_path(&fix_windows_verbatim_for_gcc(&lib_path));
-    for obj in objects {
-        cmd.add_object(obj);
+    for module in trans.modules.iter() {
+        cmd.add_object(&module.object);
     }
     cmd.output_filename(out_filename);
 
@@ -1264,7 +1179,7 @@ fn add_upstream_rust_crates(cmd: &mut Linker,
         archive.update_symbols();
 
         for f in archive.src_files() {
-            if f.ends_with("bytecode.deflate") || f == METADATA_FILENAME {
+            if f.ends_with(RLIB_BYTECODE_EXTENSION) || f == METADATA_FILENAME {
                     archive.remove_file(&f);
                     continue
                 }
@@ -1342,7 +1257,7 @@ fn add_upstream_rust_crates(cmd: &mut Linker,
 
             let mut any_objects = false;
             for f in archive.src_files() {
-                if f.ends_with("bytecode.deflate") || f == METADATA_FILENAME {
+                if f.ends_with(RLIB_BYTECODE_EXTENSION) || f == METADATA_FILENAME {
                     archive.remove_file(&f);
                     continue
                 }
diff --git a/src/librustc_trans/back/lto.rs b/src/librustc_trans/back/lto.rs
index aa13e4aa196..8651b95b12a 100644
--- a/src/librustc_trans/back/lto.rs
+++ b/src/librustc_trans/back/lto.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-use back::link;
+use back::bytecode::{DecodedBytecode, RLIB_BYTECODE_EXTENSION};
 use back::write;
 use back::symbol_export;
 use rustc::session::config;
@@ -18,17 +18,14 @@ use llvm::archive_ro::ArchiveRO;
 use llvm::{ModuleRef, TargetMachineRef, True, False};
 use rustc::middle::exported_symbols::SymbolExportLevel;
 use rustc::util::common::time;
-use rustc::util::common::path2cstr;
 use rustc::hir::def_id::LOCAL_CRATE;
 use back::write::{ModuleConfig, with_llvm_pmb, CodegenContext};
+use {ModuleTranslation, ModuleKind};
 
 use libc;
-use flate2::read::DeflateDecoder;
 
-use std::io::Read;
 use std::ffi::CString;
-use std::path::Path;
-use std::ptr::read_unaligned;
+use std::slice;
 
 pub fn crate_type_allows_lto(crate_type: config::CrateType) -> bool {
     match crate_type {
@@ -42,12 +39,58 @@ pub fn crate_type_allows_lto(crate_type: config::CrateType) -> bool {
     }
 }
 
-pub fn run(cgcx: &CodegenContext,
-           diag_handler: &Handler,
-           llmod: ModuleRef,
-           tm: TargetMachineRef,
-           config: &ModuleConfig,
-           temp_no_opt_bc_filename: &Path) -> Result<(), FatalError> {
+pub enum LtoModuleTranslation {
+    Fat {
+        module: Option<ModuleTranslation>,
+        _serialized_bitcode: Vec<SerializedModule>,
+    },
+
+    // Note the lack of other entries in this enum! Ideally this gap will one
+    // day be filled with a "Thin" LTO variant.
+}
+
+impl LtoModuleTranslation {
+    pub fn name(&self) -> &str {
+        match *self {
+            LtoModuleTranslation::Fat { .. } => "everything",
+        }
+    }
+
+    /// Optimize this module within the given codegen context.
+    ///
+    /// This function is unsafe as it'll return a `ModuleTranslation` that still
+    /// points to LLVM data structures owned by this `LtoModuleTranslation`.
+    /// It's intended that the module returned is immediately code generated and
+    /// dropped, and then this LTO module is dropped.
+    pub unsafe fn optimize(&mut self, cgcx: &CodegenContext)
+        -> Result<ModuleTranslation, FatalError>
+    {
+        match *self {
+            LtoModuleTranslation::Fat { ref mut module, .. } => {
+                let trans = module.take().unwrap();
+                let config = cgcx.config(trans.kind);
+                let llmod = trans.llvm().unwrap().llmod;
+                let tm = trans.llvm().unwrap().tm;
+                run_pass_manager(cgcx, tm, llmod, config);
+                Ok(trans)
+            }
+        }
+    }
+
+    /// A "gauge" of how costly it is to optimize this module, used to sort
+    /// biggest modules first.
+    pub fn cost(&self) -> u64 {
+        match *self {
+            // Only one module with fat LTO, so the cost doesn't matter.
+            LtoModuleTranslation::Fat { .. } => 0,
+        }
+    }
+}
+
+pub fn run(cgcx: &CodegenContext, modules: Vec<ModuleTranslation>)
+    -> Result<Vec<LtoModuleTranslation>, FatalError>
+{
+    let diag_handler = cgcx.create_diag_handler();
     if cgcx.opts.cg.prefer_dynamic {
         diag_handler.struct_err("cannot prefer dynamic linking when performing LTO")
                     .note("only 'staticlib', 'bin', and 'cdylib' outputs are \
@@ -82,80 +125,35 @@ pub fn run(cgcx: &CodegenContext,
         .iter()
         .filter_map(symbol_filter)
         .collect();
+    info!("{} symbols in whitelist", symbol_white_list.len());
 
     // For each of our upstream dependencies, find the corresponding rlib and
     // load the bitcode from the archive. Then merge it into the current LLVM
     // module that we've got.
+    let mut upstream_modules = Vec::new();
     for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
         symbol_white_list.extend(
             cgcx.exported_symbols[&cnum]
                 .iter()
                 .filter_map(symbol_filter));
+        info!("{} symbols in whitelist after {}", symbol_white_list.len(), cnum);
 
         let archive = ArchiveRO::open(&path).expect("wanted an rlib");
         let bytecodes = archive.iter().filter_map(|child| {
             child.ok().and_then(|c| c.name().map(|name| (name, c)))
-        }).filter(|&(name, _)| name.ends_with("bytecode.deflate"));
+        }).filter(|&(name, _)| name.ends_with(RLIB_BYTECODE_EXTENSION));
         for (name, data) in bytecodes {
+            info!("adding bytecode {}", name);
             let bc_encoded = data.data();
 
-            let bc_decoded = if is_versioned_bytecode_format(bc_encoded) {
-                time(cgcx.time_passes, &format!("decode {}", name), || {
-                    // Read the version
-                    let version = extract_bytecode_format_version(bc_encoded);
-
-                    if version == 1 {
-                        // The only version existing so far
-                        let data_size = extract_compressed_bytecode_size_v1(bc_encoded);
-                        let compressed_data = &bc_encoded[
-                            link::RLIB_BYTECODE_OBJECT_V1_DATA_OFFSET..
-                            (link::RLIB_BYTECODE_OBJECT_V1_DATA_OFFSET + data_size as usize)];
-
-                        let mut inflated = Vec::new();
-                        let res = DeflateDecoder::new(compressed_data)
-                            .read_to_end(&mut inflated);
-                        if res.is_err() {
-                            let msg = format!("failed to decompress bc of `{}`",
-                                              name);
-                            Err(diag_handler.fatal(&msg))
-                        } else {
-                            Ok(inflated)
-                        }
-                    } else {
-                        Err(diag_handler.fatal(&format!("Unsupported bytecode format version {}",
-                                                        version)))
-                    }
-                })?
-            } else {
-                time(cgcx.time_passes, &format!("decode {}", name), || {
-                    // the object must be in the old, pre-versioning format, so
-                    // simply inflate everything and let LLVM decide if it can
-                    // make sense of it
-                    let mut inflated = Vec::new();
-                    let res = DeflateDecoder::new(bc_encoded)
-                        .read_to_end(&mut inflated);
-                    if res.is_err() {
-                        let msg = format!("failed to decompress bc of `{}`",
-                                          name);
-                        Err(diag_handler.fatal(&msg))
-                    } else {
-                        Ok(inflated)
-                    }
-                })?
-            };
-
-            let ptr = bc_decoded.as_ptr();
-            debug!("linking {}", name);
-            time(cgcx.time_passes, &format!("ll link {}", name), || unsafe {
-                if llvm::LLVMRustLinkInExternalBitcode(llmod,
-                                                       ptr as *const libc::c_char,
-                                                       bc_decoded.len() as libc::size_t) {
-                    Ok(())
-                } else {
-                    let msg = format!("failed to load bc of `{}`", name);
-                    Err(write::llvm_err(&diag_handler, msg))
+            let (bc, id) = time(cgcx.time_passes, &format!("decode {}", name), || {
+                match DecodedBytecode::new(bc_encoded) {
+                    Ok(b) => Ok((b.bytecode(), b.identifier().to_string())),
+                    Err(e) => Err(diag_handler.fatal(&e)),
                 }
             })?;
+            let bc = SerializedModule::FromRlib(bc);
+            upstream_modules.push((bc, CString::new(id).unwrap()));
         }
     }
 
@@ -163,25 +161,104 @@ pub fn run(cgcx: &CodegenContext,
     let arr: Vec<*const libc::c_char> = symbol_white_list.iter()
                                                          .map(|c| c.as_ptr())
                                                          .collect();
-    let ptr = arr.as_ptr();
+
+    fat_lto(cgcx, &diag_handler, modules, upstream_modules, &arr)
+}
+
+fn fat_lto(cgcx: &CodegenContext,
+           diag_handler: &Handler,
+           mut modules: Vec<ModuleTranslation>,
+           mut serialized_modules: Vec<(SerializedModule, CString)>,
+           symbol_white_list: &[*const libc::c_char])
+    -> Result<Vec<LtoModuleTranslation>, FatalError>
+{
+    info!("going for a fat lto");
+
+    // Find the "costliest" module and merge everything into that codegen unit.
+    // All the other modules will be serialized and reparsed into the new
+    // context, so this hopefully avoids serializing and parsing the largest
+    // codegen unit.
+    //
+    // Additionally use a regular module as the base here to ensure that various
+    // file copy operations in the backend work correctly. The only other kind
+    // of module here should be an allocator one, and if your crate is smaller
+    // than the allocator module then the size doesn't really matter anyway.
+    let (_, costliest_module) = modules.iter()
+        .enumerate()
+        .filter(|&(_, module)| module.kind == ModuleKind::Regular)
+        .map(|(i, module)| {
+            let cost = unsafe {
+                llvm::LLVMRustModuleCost(module.llvm().unwrap().llmod)
+            };
+            (cost, i)
+        })
+        .max()
+        .expect("must be trans'ing at least one module");
+    let module = modules.remove(costliest_module);
+    let llmod = module.llvm().expect("can't lto pre-translated modules").llmod;
+    info!("using {:?} as a base module", module.llmod_id);
+
+    // For all other modules we translated we'll need to link them into our own
+    // bitcode. All modules were translated in their own LLVM context, however,
+    // and we want to move everything to the same LLVM context. Currently the
+    // way we know of to do that is to serialize them to a string and then parse
+    // them later. Not great but hey, that's why it's "fat" LTO, right?
+    for module in modules {
+        let llvm = module.llvm().expect("can't lto pre-translated modules");
+        let buffer = ModuleBuffer::new(llvm.llmod);
+        let llmod_id = CString::new(&module.llmod_id[..]).unwrap();
+        serialized_modules.push((SerializedModule::Local(buffer), llmod_id));
+    }
+
+    // For all serialized bitcode files we parse them and link them in as we
+    // did above; this is all mostly handled in C++. Like above, though, we
+    // don't know much about the memory management here, so we err on the side
+    // of being safe and persist everything with the original module.
+    let mut serialized_bitcode = Vec::new();
+    for (bc_decoded, name) in serialized_modules {
+        info!("linking {:?}", name);
+        time(cgcx.time_passes, &format!("ll link {:?}", name), || unsafe {
+            let data = bc_decoded.data();
+            if llvm::LLVMRustLinkInExternalBitcode(llmod,
+                                                   data.as_ptr() as *const libc::c_char,
+                                                   data.len() as libc::size_t) {
+                Ok(())
+            } else {
+                let msg = format!("failed to load bc of {:?}", name);
+                Err(write::llvm_err(&diag_handler, msg))
+            }
+        })?;
+        serialized_bitcode.push(bc_decoded);
+    }
+    cgcx.save_temp_bitcode(&module, "lto.input");
+
+    // Internalize everything that *isn't* in our whitelist to help strip out
+    // more modules and such
     unsafe {
+        let ptr = symbol_white_list.as_ptr();
         llvm::LLVMRustRunRestrictionPass(llmod,
                                          ptr as *const *const libc::c_char,
-                                         arr.len() as libc::size_t);
+                                         symbol_white_list.len() as libc::size_t);
+        cgcx.save_temp_bitcode(&module, "lto.after-restriction");
     }
 
     if cgcx.no_landing_pads {
         unsafe {
             llvm::LLVMRustMarkAllFunctionsNounwind(llmod);
         }
+        cgcx.save_temp_bitcode(&module, "lto.after-nounwind");
     }
 
-    if cgcx.opts.cg.save_temps {
-        let cstr = path2cstr(temp_no_opt_bc_filename);
-        unsafe {
-            llvm::LLVMWriteBitcodeToFile(llmod, cstr.as_ptr());
-        }
-    }
+    Ok(vec![LtoModuleTranslation::Fat {
+        module: Some(module),
+        _serialized_bitcode: serialized_bitcode,
+    }])
+}
+
+fn run_pass_manager(cgcx: &CodegenContext,
+                    tm: TargetMachineRef,
+                    llmod: ModuleRef,
+                    config: &ModuleConfig) {
 
     // Now we have one massive module inside of llmod. Time to run the
     // LTO-specific optimization passes that LLVM provides.
@@ -212,25 +289,45 @@ pub fn run(cgcx: &CodegenContext,
         llvm::LLVMDisposePassManager(pm);
     }
     debug!("lto done");
-    Ok(())
 }
 
-fn is_versioned_bytecode_format(bc: &[u8]) -> bool {
-    let magic_id_byte_count = link::RLIB_BYTECODE_OBJECT_MAGIC.len();
-    return bc.len() > magic_id_byte_count &&
-           &bc[..magic_id_byte_count] == link::RLIB_BYTECODE_OBJECT_MAGIC;
+pub enum SerializedModule {
+    Local(ModuleBuffer),
+    FromRlib(Vec<u8>),
 }
 
-fn extract_bytecode_format_version(bc: &[u8]) -> u32 {
-    let pos = link::RLIB_BYTECODE_OBJECT_VERSION_OFFSET;
-    let byte_data = &bc[pos..pos + 4];
-    let data = unsafe { read_unaligned(byte_data.as_ptr() as *const u32) };
-    u32::from_le(data)
+impl SerializedModule {
+    fn data(&self) -> &[u8] {
+        match *self {
+            SerializedModule::Local(ref m) => m.data(),
+            SerializedModule::FromRlib(ref m) => m,
+        }
+    }
 }
 
-fn extract_compressed_bytecode_size_v1(bc: &[u8]) -> u64 {
-    let pos = link::RLIB_BYTECODE_OBJECT_V1_DATASIZE_OFFSET;
-    let byte_data = &bc[pos..pos + 8];
-    let data = unsafe { read_unaligned(byte_data.as_ptr() as *const u64) };
-    u64::from_le(data)
+pub struct ModuleBuffer(*mut llvm::ModuleBuffer);
+
+unsafe impl Send for ModuleBuffer {}
+unsafe impl Sync for ModuleBuffer {}
+
+impl ModuleBuffer {
+    fn new(m: ModuleRef) -> ModuleBuffer {
+        ModuleBuffer(unsafe {
+            llvm::LLVMRustModuleBufferCreate(m)
+        })
+    }
+
+    fn data(&self) -> &[u8] {
+        unsafe {
+            let ptr = llvm::LLVMRustModuleBufferPtr(self.0);
+            let len = llvm::LLVMRustModuleBufferLen(self.0);
+            slice::from_raw_parts(ptr, len)
+        }
+    }
+}
+
+impl Drop for ModuleBuffer {
+    fn drop(&mut self) {
+        unsafe { llvm::LLVMRustModuleBufferFree(self.0); }
+    }
 }
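
The `cost` reported by `LtoModuleTranslation::cost` feeds the coordinator's work queue in the `write.rs` changes below, which keep work items sorted by estimated cost so the most expensive modules are started first. A standalone sketch of that insertion pattern (the names and types here are illustrative, not the compiler's own):

```rust
// Sketch only: keep a queue sorted ascending by cost, so the costliest item
// ends up at the back where a worker popping from the end picks it up first.
fn enqueue_by_cost(queue: &mut Vec<(String, u64)>, item: String, cost: u64) {
    // `binary_search_by_key` returns the insertion point when the key is absent.
    let idx = queue
        .binary_search_by_key(&cost, |&(_, c)| c)
        .unwrap_or_else(|i| i);
    queue.insert(idx, (item, cost));
}

fn main() {
    let mut queue = Vec::new();
    enqueue_by_cost(&mut queue, "allocator".to_string(), 0);
    enqueue_by_cost(&mut queue, "cgu-large".to_string(), 120);
    enqueue_by_cost(&mut queue, "cgu-small".to_string(), 45);
    assert_eq!(queue.last().unwrap().0, "cgu-large");
}
```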
diff --git a/src/librustc_trans/back/write.rs b/src/librustc_trans/back/write.rs
index 6b980a37ac7..c39bdcf25cd 100644
--- a/src/librustc_trans/back/write.rs
+++ b/src/librustc_trans/back/write.rs
@@ -22,7 +22,7 @@ use rustc::util::nodemap::FxHashMap;
 use time_graph::{self, TimeGraph};
 use llvm;
 use llvm::{ModuleRef, TargetMachineRef, PassManagerRef, DiagnosticInfoRef};
-use llvm::SMDiagnosticRef;
+use llvm::{SMDiagnosticRef, ContextRef};
 use {CrateTranslation, ModuleSource, ModuleTranslation, CompiledModule, ModuleKind};
 use CrateInfo;
 use rustc::hir::def_id::{CrateNum, LOCAL_CRATE};
@@ -41,10 +41,10 @@ use rustc_demangle;
 
 use std::any::Any;
 use std::ffi::CString;
-use std::fmt;
 use std::fs;
 use std::io;
 use std::io::Write;
+use std::mem;
 use std::path::{Path, PathBuf};
 use std::str;
 use std::sync::Arc;
@@ -143,6 +143,14 @@ fn get_llvm_opt_size(optimize: config::OptLevel) -> llvm::CodeGenOptSize {
 }
 
 pub fn create_target_machine(sess: &Session) -> TargetMachineRef {
+    target_machine_factory(sess)().unwrap_or_else(|err| {
+        panic!(llvm_err(sess.diagnostic(), err))
+    })
+}
+
+pub fn target_machine_factory(sess: &Session)
+    -> Arc<Fn() -> Result<TargetMachineRef, String> + Send + Sync>
+{
     let reloc_model = get_reloc_model(sess);
 
     let opt_level = get_llvm_opt_level(sess.opts.optimize);
@@ -171,36 +179,38 @@ pub fn create_target_machine(sess: &Session) -> TargetMachineRef {
 
     let triple = &sess.target.target.llvm_target;
 
-    let tm = unsafe {
-        let triple = CString::new(triple.as_bytes()).unwrap();
-        let cpu = match sess.opts.cg.target_cpu {
-            Some(ref s) => &**s,
-            None => &*sess.target.target.options.cpu
-        };
-        let cpu = CString::new(cpu.as_bytes()).unwrap();
-        let features = CString::new(target_feature(sess).as_bytes()).unwrap();
-        llvm::LLVMRustCreateTargetMachine(
-            triple.as_ptr(), cpu.as_ptr(), features.as_ptr(),
-            code_model,
-            reloc_model,
-            opt_level,
-            use_softfp,
-            is_pie_binary(sess),
-            ffunction_sections,
-            fdata_sections,
-        )
+    let triple = CString::new(triple.as_bytes()).unwrap();
+    let cpu = match sess.opts.cg.target_cpu {
+        Some(ref s) => &**s,
+        None => &*sess.target.target.options.cpu
     };
+    let cpu = CString::new(cpu.as_bytes()).unwrap();
+    let features = CString::new(target_feature(sess).as_bytes()).unwrap();
+    let is_pie_binary = is_pie_binary(sess);
+
+    Arc::new(move || {
+        let tm = unsafe {
+            llvm::LLVMRustCreateTargetMachine(
+                triple.as_ptr(), cpu.as_ptr(), features.as_ptr(),
+                code_model,
+                reloc_model,
+                opt_level,
+                use_softfp,
+                is_pie_binary,
+                ffunction_sections,
+                fdata_sections,
+            )
+        };
 
-    if tm.is_null() {
-        let msg = format!("Could not create LLVM TargetMachine for triple: {}",
-                          triple);
-        panic!(llvm_err(sess.diagnostic(), msg));
-    } else {
-        return tm;
-    };
+        if tm.is_null() {
+            Err(format!("Could not create LLVM TargetMachine for triple: {}",
+                        triple.to_str().unwrap()))
+        } else {
+            Ok(tm)
+        }
+    })
 }
 
-
 /// Module-specific configuration for `optimize_and_codegen`.
 pub struct ModuleConfig {
     /// Names of additional optimization passes to run.
@@ -294,6 +304,7 @@ pub struct CodegenContext {
     pub time_passes: bool,
     pub lto: bool,
     pub no_landing_pads: bool,
+    pub save_temps: bool,
     pub exported_symbols: Arc<ExportedSymbols>,
     pub opts: Arc<config::Options>,
     pub crate_types: Vec<config::CrateType>,
@@ -302,6 +313,7 @@ pub struct CodegenContext {
     regular_module_config: Arc<ModuleConfig>,
     metadata_module_config: Arc<ModuleConfig>,
     allocator_module_config: Arc<ModuleConfig>,
+    pub tm_factory: Arc<Fn() -> Result<TargetMachineRef, String> + Send + Sync>,
 
     // Handler to use for diagnostics produced during codegen.
     pub diag_emitter: SharedEmitter,
@@ -322,22 +334,62 @@ pub struct CodegenContext {
 }
 
 impl CodegenContext {
-    fn create_diag_handler(&self) -> Handler {
+    pub fn create_diag_handler(&self) -> Handler {
         Handler::with_emitter(true, false, Box::new(self.diag_emitter.clone()))
     }
 
-    fn config(&self, kind: ModuleKind) -> &ModuleConfig {
+    pub fn config(&self, kind: ModuleKind) -> &ModuleConfig {
         match kind {
             ModuleKind::Regular => &self.regular_module_config,
             ModuleKind::Metadata => &self.metadata_module_config,
             ModuleKind::Allocator => &self.allocator_module_config,
         }
     }
+
+    pub fn save_temp_bitcode(&self, trans: &ModuleTranslation, name: &str) {
+        if !self.save_temps {
+            return
+        }
+        unsafe {
+            let ext = format!("{}.bc", name);
+            let cgu = Some(&trans.name[..]);
+            let path = self.output_filenames.temp_path_ext(&ext, cgu);
+            let cstr = path2cstr(&path);
+            let llmod = trans.llvm().unwrap().llmod;
+            llvm::LLVMWriteBitcodeToFile(llmod, cstr.as_ptr());
+        }
+    }
+}
+
+struct DiagnosticHandlers<'a> {
+    inner: Box<(&'a CodegenContext, &'a Handler)>,
+    llcx: ContextRef,
+}
+
+impl<'a> DiagnosticHandlers<'a> {
+    fn new(cgcx: &'a CodegenContext,
+           handler: &'a Handler,
+           llcx: ContextRef) -> DiagnosticHandlers<'a> {
+        let data = Box::new((cgcx, handler));
+        unsafe {
+            let arg = &*data as &(_, _) as *const _ as *mut _;
+            llvm::LLVMRustSetInlineAsmDiagnosticHandler(llcx, inline_asm_handler, arg);
+            llvm::LLVMContextSetDiagnosticHandler(llcx, diagnostic_handler, arg);
+        }
+        DiagnosticHandlers {
+            inner: data,
+            llcx: llcx,
+        }
+    }
 }
 
-struct HandlerFreeVars<'a> {
-    cgcx: &'a CodegenContext,
-    diag_handler: &'a Handler,
+impl<'a> Drop for DiagnosticHandlers<'a> {
+    fn drop(&mut self) {
+        unsafe {
+            llvm::LLVMRustSetInlineAsmDiagnosticHandler(self.llcx, inline_asm_handler, 0 as *mut _);
+            llvm::LLVMContextSetDiagnosticHandler(self.llcx, diagnostic_handler, 0 as *mut _);
+        }
+    }
 }
 
 unsafe extern "C" fn report_inline_asm<'a, 'b>(cgcx: &'a CodegenContext,
@@ -349,7 +401,10 @@ unsafe extern "C" fn report_inline_asm<'a, 'b>(cgcx: &'a CodegenContext,
 unsafe extern "C" fn inline_asm_handler(diag: SMDiagnosticRef,
                                         user: *const c_void,
                                         cookie: c_uint) {
-    let HandlerFreeVars { cgcx, .. } = *(user as *const HandlerFreeVars);
+    if user.is_null() {
+        return
+    }
+    let (cgcx, _) = *(user as *const (&CodegenContext, &Handler));
 
     let msg = llvm::build_string(|s| llvm::LLVMRustWriteSMDiagnosticToString(diag, s))
         .expect("non-UTF8 SMDiagnostic");
@@ -358,7 +413,10 @@ unsafe extern "C" fn inline_asm_handler(diag: SMDiagnosticRef,
 }
 
 unsafe extern "C" fn diagnostic_handler(info: DiagnosticInfoRef, user: *mut c_void) {
-    let HandlerFreeVars { cgcx, diag_handler, .. } = *(user as *const HandlerFreeVars);
+    if user.is_null() {
+        return
+    }
+    let (cgcx, diag_handler) = *(user as *const (&CodegenContext, &Handler));
 
     match llvm::diagnostic::Diagnostic::unpack(info) {
         llvm::diagnostic::InlineAsm(inline) => {
@@ -389,28 +447,20 @@ unsafe extern "C" fn diagnostic_handler(info: DiagnosticInfoRef, user: *mut c_vo
 }
 
 // Unsafe due to LLVM calls.
-unsafe fn optimize_and_codegen(cgcx: &CodegenContext,
-                               diag_handler: &Handler,
-                               mtrans: ModuleTranslation,
-                               tm: TargetMachineRef,
-                               config: &ModuleConfig)
-    -> Result<CompiledModule, FatalError>
+unsafe fn optimize(cgcx: &CodegenContext,
+                   diag_handler: &Handler,
+                   mtrans: &ModuleTranslation,
+                   config: &ModuleConfig)
+    -> Result<(), FatalError>
 {
-    let (llmod, llcx) = match mtrans.source {
-        ModuleSource::Translated(ref llvm) => (llvm.llmod, llvm.llcx),
+    let (llmod, llcx, tm) = match mtrans.source {
+        ModuleSource::Translated(ref llvm) => (llvm.llmod, llvm.llcx, llvm.tm),
         ModuleSource::Preexisting(_) => {
             bug!("optimize_and_codegen: called with ModuleSource::Preexisting")
         }
     };
 
-    let fv = HandlerFreeVars {
-        cgcx,
-        diag_handler,
-    };
-    let fv = &fv as *const HandlerFreeVars as *mut c_void;
-
-    llvm::LLVMRustSetInlineAsmDiagnosticHandler(llcx, inline_asm_handler, fv);
-    llvm::LLVMContextSetDiagnosticHandler(llcx, diagnostic_handler, fv);
+    let _handlers = DiagnosticHandlers::new(cgcx, diag_handler, llcx);
 
     let module_name = mtrans.name.clone();
     let module_name = Some(&module_name[..]);
@@ -485,25 +535,37 @@ unsafe fn optimize_and_codegen(cgcx: &CodegenContext,
         // Deallocate managers that we're now done with
         llvm::LLVMDisposePassManager(fpm);
         llvm::LLVMDisposePassManager(mpm);
+    }
+    Ok(())
+}
 
-        if cgcx.lto {
-            time(cgcx.time_passes, "all lto passes", || {
-                let temp_no_opt_bc_filename =
-                    cgcx.output_filenames.temp_path_ext("no-opt.lto.bc", module_name);
-                lto::run(cgcx,
-                         diag_handler,
-                         llmod,
-                         tm,
-                         &config,
-                         &temp_no_opt_bc_filename)
-            })?;
-            if config.emit_lto_bc {
-                let out = cgcx.output_filenames.temp_path_ext("lto.bc", module_name);
-                let out = path2cstr(&out);
-                llvm::LLVMWriteBitcodeToFile(llmod, out.as_ptr());
-            }
+fn generate_lto_work(cgcx: &CodegenContext,
+                     modules: Vec<ModuleTranslation>)
+    -> Vec<(WorkItem, u64)>
+{
+    let lto_modules = lto::run(cgcx, modules).unwrap_or_else(|e| panic!(e));
+
+    lto_modules.into_iter().map(|module| {
+        let cost = module.cost();
+        (WorkItem::LTO(module), cost)
+    }).collect()
+}
+
+unsafe fn codegen(cgcx: &CodegenContext,
+                  diag_handler: &Handler,
+                  mtrans: ModuleTranslation,
+                  config: &ModuleConfig)
+    -> Result<CompiledModule, FatalError>
+{
+    let (llmod, llcx, tm) = match mtrans.source {
+        ModuleSource::Translated(ref llvm) => (llvm.llmod, llvm.llcx, llvm.tm),
+        ModuleSource::Preexisting(_) => {
+            bug!("codegen: called with ModuleSource::Preexisting")
         }
-    }
+    };
+    let module_name = mtrans.name.clone();
+    let module_name = Some(&module_name[..]);
+    let handlers = DiagnosticHandlers::new(cgcx, diag_handler, llcx);
 
     // A codegen-specific pass manager is used to generate object
     // files for an LLVM module.
@@ -629,7 +691,10 @@ unsafe fn optimize_and_codegen(cgcx: &CodegenContext,
         }
     }
 
-    Ok(mtrans.into_compiled_module(config.emit_obj, config.emit_bc))
+    drop(handlers);
+    Ok(mtrans.into_compiled_module(config.emit_obj,
+                                   config.emit_bc,
+                                   &cgcx.output_filenames))
 }
 
 pub struct CompiledModules {
@@ -990,37 +1055,50 @@ pub fn dump_incremental_data(trans: &CrateTranslation) {
     eprintln!("incremental: re-using {} out of {} modules", reuse, trans.modules.len());
 }
 
-struct WorkItem {
-    mtrans: ModuleTranslation,
-    tm: TargetMachine,
+enum WorkItem {
+    Optimize(ModuleTranslation),
+    LTO(lto::LtoModuleTranslation),
 }
 
-impl fmt::Debug for WorkItem {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "WorkItem({})", self.mtrans.name)
+impl WorkItem {
+    fn kind(&self) -> ModuleKind {
+        match *self {
+            WorkItem::Optimize(ref m) => m.kind,
+            WorkItem::LTO(_) => ModuleKind::Regular,
+        }
     }
-}
-
-struct TargetMachine(TargetMachineRef);
 
-unsafe impl Send for TargetMachine {}
-
-impl Drop for TargetMachine {
-    fn drop(&mut self) {
-        unsafe {
-            llvm::LLVMRustDisposeTargetMachine(self.0);
+    fn name(&self) -> String {
+        match *self {
+            WorkItem::Optimize(ref m) => format!("optimize: {}", m.name),
+            WorkItem::LTO(ref m) => format!("lto: {}", m.name()),
         }
     }
 }
 
+enum WorkItemResult {
+    Compiled(CompiledModule),
+    NeedsLTO(ModuleTranslation),
+}
+
 fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
-    -> Result<CompiledModule, FatalError>
+    -> Result<WorkItemResult, FatalError>
 {
     let diag_handler = cgcx.create_diag_handler();
-    let module_name = work_item.mtrans.name.clone();
-    let config = cgcx.config(work_item.mtrans.kind);
+    let config = cgcx.config(work_item.kind());
+    let mtrans = match work_item {
+        WorkItem::Optimize(mtrans) => mtrans,
+        WorkItem::LTO(mut lto) => {
+            unsafe {
+                let module = lto.optimize(cgcx)?;
+                let module = codegen(cgcx, &diag_handler, module, config)?;
+                return Ok(WorkItemResult::Compiled(module))
+            }
+        }
+    };
+    let module_name = mtrans.name.clone();
 
-    let pre_existing = match work_item.mtrans.source {
+    let pre_existing = match mtrans.source {
         ModuleSource::Translated(_) => None,
         ModuleSource::Preexisting(ref wp) => Some(wp.clone()),
     };
@@ -1029,13 +1107,13 @@ fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
         let incr_comp_session_dir = cgcx.incr_comp_session_dir
                                         .as_ref()
                                         .unwrap();
-        let name = &work_item.mtrans.name;
+        let name = &mtrans.name;
         for (kind, saved_file) in wp.saved_files {
             let obj_out = cgcx.output_filenames.temp_path(kind, Some(name));
             let source_file = in_incr_comp_dir(&incr_comp_session_dir,
                                                &saved_file);
             debug!("copying pre-existing module `{}` from {:?} to {}",
-                   work_item.mtrans.name,
+                   mtrans.name,
                    source_file,
                    obj_out.display());
             match link_or_copy(&source_file, &obj_out) {
@@ -1048,31 +1126,39 @@ fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
                 }
             }
         }
+        let object = cgcx.output_filenames.temp_path(OutputType::Object, Some(name));
 
-        Ok(CompiledModule {
+        Ok(WorkItemResult::Compiled(CompiledModule {
+            object,
+            llmod_id: mtrans.llmod_id.clone(),
             name: module_name,
             kind: ModuleKind::Regular,
             pre_existing: true,
-            symbol_name_hash: work_item.mtrans.symbol_name_hash,
+            symbol_name_hash: mtrans.symbol_name_hash,
             emit_bc: config.emit_bc,
             emit_obj: config.emit_obj,
-        })
+        }))
     } else {
         debug!("llvm-optimizing {:?}", module_name);
 
         unsafe {
-            optimize_and_codegen(cgcx,
-                                 &diag_handler,
-                                 work_item.mtrans,
-                                 work_item.tm.0,
-                                 config)
+            optimize(cgcx, &diag_handler, &mtrans, config)?;
+            if !cgcx.lto || mtrans.kind == ModuleKind::Metadata {
+                let module = codegen(cgcx, &diag_handler, mtrans, config)?;
+                Ok(WorkItemResult::Compiled(module))
+            } else {
+                Ok(WorkItemResult::NeedsLTO(mtrans))
+            }
         }
     }
 }
 
-#[derive(Debug)]
 enum Message {
     Token(io::Result<Acquired>),
+    NeedsLTO {
+        result: ModuleTranslation,
+        worker_id: usize,
+    },
     Done {
         result: Result<CompiledModule, ()>,
         worker_id: usize,
@@ -1146,6 +1232,7 @@ fn start_executing_work(tcx: TyCtxt,
         each_linked_rlib_for_lto,
         lto: sess.lto(),
         no_landing_pads: sess.no_landing_pads(),
+        save_temps: sess.opts.cg.save_temps,
         opts: Arc::new(sess.opts.clone()),
         time_passes: sess.time_passes(),
         exported_symbols,
@@ -1160,6 +1247,7 @@ fn start_executing_work(tcx: TyCtxt,
         regular_module_config: modules_config,
         metadata_module_config: metadata_config,
         allocator_module_config: allocator_config,
+        tm_factory: target_machine_factory(tcx.sess),
     };
 
     // This is the "main loop" of parallel work happening for parallel codegen.
@@ -1282,6 +1370,21 @@ fn start_executing_work(tcx: TyCtxt,
     // and whenever we're done with that work we release the semaphore. In this
     // manner we can ensure that the maximum number of parallel workers is
     // capped at any one point in time.
+    //
+    // LTO and the coordinator thread
+    // ------------------------------
+    //
+    // The final job the coordinator thread is responsible for is managing LTO
+    // and how that works. When LTO is requested, what we'll do is collect all
+    // optimized LLVM modules into a local vector on the coordinator. Once all
+    // modules have been translated and optimized we hand this to the `lto`
+    // module for further optimization. The `lto` module will return back a list
+    // of more modules to work on, which the coordinator will continue to spawn
+    // work for.
+    //
+    // Each LLVM module is automatically sent back to the coordinator for LTO
+    // if necessary. There are already optimizations in place to avoid sending
+    // work back to the coordinator when LTO isn't requested.
     return thread::spawn(move || {
         // We pretend to be within the top-level LLVM time-passes task here:
         set_time_depth(1);
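
A minimal sketch of the hand-off described above, as implemented further down
in this diff (only `generate_lto_work` and the cost-ordered `work_items` queue
come from this patch; the generic helper below is purely illustrative):

    // Sketch: merge freshly generated LTO work into a queue kept sorted by
    // estimated cost, so that `pop()` always yields the most expensive item.
    fn queue_lto_work<W, I>(work_items: &mut Vec<(W, u64)>, new_work: I)
        where I: IntoIterator<Item = (W, u64)>
    {
        for (work, cost) in new_work {
            // Find the insertion point that keeps the vector sorted by cost.
            let idx = work_items
                .binary_search_by_key(&cost, |&(_, c)| c)
                .unwrap_or_else(|e| e);
            work_items.insert(idx, (work, cost));
        }
    }

    // Usage (hypothetical): queue_lto_work(&mut work_items,
    //                                      generate_lto_work(&cgcx, modules));
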
@@ -1304,6 +1407,8 @@ fn start_executing_work(tcx: TyCtxt,
         let mut compiled_modules = vec![];
         let mut compiled_metadata_module = None;
         let mut compiled_allocator_module = None;
+        let mut needs_lto = Vec::new();
+        let mut started_lto = false;
 
         // This flag tracks whether all items have gone through translations
         let mut translation_done = false;
@@ -1325,6 +1430,7 @@ fn start_executing_work(tcx: TyCtxt,
         while !translation_done ||
               work_items.len() > 0 ||
               running > 0 ||
+              needs_lto.len() > 0 ||
               main_thread_worker_state != MainThreadWorkerState::Idle {
 
             // While there are still CGUs to be translated, the coordinator has
@@ -1348,13 +1454,34 @@ fn start_executing_work(tcx: TyCtxt,
                             worker: get_worker_id(&mut free_worker_ids),
                             .. cgcx.clone()
                         };
-                        maybe_start_llvm_timer(cgcx.config(item.mtrans.kind),
+                        maybe_start_llvm_timer(cgcx.config(item.kind()),
                                                &mut llvm_start_time);
                         main_thread_worker_state = MainThreadWorkerState::LLVMing;
                         spawn_work(cgcx, item);
                     }
                 }
             } else {
+                // If we've finished everything related to normal translation,
+                // then it must be the case that we've got some LTO work to do.
+                // Perform the serial work here of figuring out what we're
+                // going to LTO and then push a bunch of work items onto our
+                // queue to do LTO.
+                if work_items.len() == 0 &&
+                   running == 0 &&
+                   main_thread_worker_state == MainThreadWorkerState::Idle {
+                    assert!(!started_lto);
+                    assert!(needs_lto.len() > 0);
+                    started_lto = true;
+                    let modules = mem::replace(&mut needs_lto, Vec::new());
+                    for (work, cost) in generate_lto_work(&cgcx, modules) {
+                        let insertion_index = work_items
+                            .binary_search_by_key(&cost, |&(_, cost)| cost)
+                            .unwrap_or_else(|e| e);
+                        work_items.insert(insertion_index, (work, cost));
+                        helper.request_token();
+                    }
+                }
+
                 // In this branch, we know that everything has been translated,
                 // so it's just a matter of determining whether the implicit
                 // Token is free to use for LLVM work.
@@ -1365,7 +1492,7 @@ fn start_executing_work(tcx: TyCtxt,
                                 worker: get_worker_id(&mut free_worker_ids),
                                 .. cgcx.clone()
                             };
-                            maybe_start_llvm_timer(cgcx.config(item.mtrans.kind),
+                            maybe_start_llvm_timer(cgcx.config(item.kind()),
                                                    &mut llvm_start_time);
                             main_thread_worker_state = MainThreadWorkerState::LLVMing;
                             spawn_work(cgcx, item);
@@ -1396,7 +1523,7 @@ fn start_executing_work(tcx: TyCtxt,
             while work_items.len() > 0 && running < tokens.len() {
                 let (item, _) = work_items.pop().unwrap();
 
-                maybe_start_llvm_timer(cgcx.config(item.mtrans.kind),
+                maybe_start_llvm_timer(cgcx.config(item.kind()),
                                        &mut llvm_start_time);
 
                 let cgcx = CodegenContext {
@@ -1499,6 +1626,17 @@ fn start_executing_work(tcx: TyCtxt,
                         }
                     }
                 }
+                Message::NeedsLTO { result, worker_id } => {
+                    assert!(!started_lto);
+                    if main_thread_worker_state == MainThreadWorkerState::LLVMing {
+                        main_thread_worker_state = MainThreadWorkerState::Idle;
+                    } else {
+                        running -= 1;
+                    }
+
+                    free_worker_ids.push(worker_id);
+                    needs_lto.push(result);
+                }
                 Message::Done { result: Err(()), worker_id: _ } => {
                     shared_emitter.fatal("aborting due to worker thread panic");
                     // Exit the coordinator thread
@@ -1575,20 +1713,22 @@ fn spawn_work(cgcx: CodegenContext, work: WorkItem) {
         // we exit.
         struct Bomb {
             coordinator_send: Sender<Box<Any + Send>>,
-            result: Option<CompiledModule>,
+            result: Option<WorkItemResult>,
             worker_id: usize,
         }
         impl Drop for Bomb {
             fn drop(&mut self) {
-                let result = match self.result.take() {
-                    Some(compiled_module) => Ok(compiled_module),
-                    None => Err(())
+                let worker_id = self.worker_id;
+                let msg = match self.result.take() {
+                    Some(WorkItemResult::Compiled(m)) => {
+                        Message::Done { result: Ok(m), worker_id }
+                    }
+                    Some(WorkItemResult::NeedsLTO(m)) => {
+                        Message::NeedsLTO { result: m, worker_id }
+                    }
+                    None => Message::Done { result: Err(()), worker_id }
                 };
-
-                drop(self.coordinator_send.send(Box::new(Message::Done {
-                    result,
-                    worker_id: self.worker_id,
-                })));
+                drop(self.coordinator_send.send(Box::new(msg)));
             }
         }
 
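
The `Bomb` struct above is the drop-guard idiom: whether `execute_work_item`
returns normally or panics, dropping the guard sends exactly one message back
to the coordinator, so the main loop never waits forever for a reply. A
stripped-down, self-contained version of the same idiom (the `Reply` names
here are hypothetical, not the message types used in this patch):

    use std::sync::mpsc::Sender;

    enum Reply { Finished, Panicked }

    // Constructing one of these at the top of a worker guarantees the
    // coordinator hears back exactly once, even if the work panics.
    struct ReplyOnDrop {
        tx: Sender<Reply>,
        result: Option<Reply>,
    }

    impl Drop for ReplyOnDrop {
        fn drop(&mut self) {
            // `drop` also runs while unwinding from a panic; if no result
            // was recorded, report the failure rather than going silent.
            let msg = self.result.take().unwrap_or(Reply::Panicked);
            let _ = self.tx.send(msg);
        }
    }
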
@@ -1612,10 +1752,11 @@ fn spawn_work(cgcx: CodegenContext, work: WorkItem) {
         // we're done, which if `execute_work_item` failed is unlikely to be
         // seen by the main thread, but hey we might as well try anyway.
         bomb.result = {
-            let _timing_guard = cgcx.time_graph
-                                .as_ref()
-                                .map(|tg| tg.start(time_graph::TimelineId(cgcx.worker),
-                                                   LLVM_WORK_PACKAGE_KIND));
+            let _timing_guard = cgcx.time_graph.as_ref().map(|tg| {
+                tg.start(time_graph::TimelineId(cgcx.worker),
+                         LLVM_WORK_PACKAGE_KIND,
+                         &work.name())
+            });
             Some(execute_work_item(&cgcx, work).unwrap())
         };
     });
@@ -1923,9 +2064,7 @@ impl OngoingCrateTranslation {
             Ok(Message::TranslateItem) => {
                 // Nothing to do
             }
-            Ok(message) => {
-                panic!("unexpected message: {:?}", message)
-            }
+            Ok(_) => panic!("unexpected message"),
             Err(_) => {
                 // One of the LLVM threads must have panicked, fall through so
                 // error handling can be reached.
@@ -1937,10 +2076,7 @@ impl OngoingCrateTranslation {
 pub fn submit_translated_module_to_llvm(tcx: TyCtxt,
                                         mtrans: ModuleTranslation,
                                         cost: u64) {
-    let llvm_work_item = WorkItem {
-        mtrans,
-        tm: TargetMachine(create_target_machine(tcx.sess)),
-    };
+    let llvm_work_item = WorkItem::Optimize(mtrans);
     drop(tcx.tx_to_llvm_workers.send(Box::new(Message::TranslationDone {
         llvm_work_item,
         cost,
diff --git a/src/librustc_trans/base.rs b/src/librustc_trans/base.rs
index 774acc81343..92119e411cd 100644
--- a/src/librustc_trans/base.rs
+++ b/src/librustc_trans/base.rs
@@ -31,7 +31,7 @@ use super::ModuleKind;
 use assert_module_sources::{self, Disposition};
 use back::link;
 use back::symbol_export;
-use back::write::{self, OngoingCrateTranslation};
+use back::write::{self, OngoingCrateTranslation, create_target_machine};
 use llvm::{ContextRef, ModuleRef, ValueRef, Vector, get_param};
 use llvm;
 use metadata;
@@ -732,6 +732,7 @@ fn contains_null(s: &str) -> bool {
 }
 
 fn write_metadata<'a, 'gcx>(tcx: TyCtxt<'a, 'gcx, 'gcx>,
+                            llmod_id: &str,
                             link_meta: &LinkMeta,
                             exported_symbols: &NodeSet)
                             -> (ContextRef, ModuleRef,
@@ -741,7 +742,7 @@ fn write_metadata<'a, 'gcx>(tcx: TyCtxt<'a, 'gcx, 'gcx>,
     use flate2::write::DeflateEncoder;
 
     let (metadata_llcx, metadata_llmod) = unsafe {
-        context::create_context_and_module(tcx.sess, "metadata")
+        context::create_context_and_module(tcx.sess, llmod_id)
     };
 
     #[derive(PartialEq, Eq, PartialOrd, Ord)]
@@ -886,17 +887,20 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
 
     let shared_ccx = SharedCrateContext::new(tcx);
     // Translate the metadata.
+    let llmod_id = "metadata";
     let (metadata_llcx, metadata_llmod, metadata, metadata_incr_hashes) =
         time(tcx.sess.time_passes(), "write metadata", || {
-            write_metadata(tcx, &link_meta, &exported_symbol_node_ids)
+            write_metadata(tcx, llmod_id, &link_meta, &exported_symbol_node_ids)
         });
 
     let metadata_module = ModuleTranslation {
         name: link::METADATA_MODULE_NAME.to_string(),
+        llmod_id: llmod_id.to_string(),
         symbol_name_hash: 0, // we always rebuild metadata, at least for now
         source: ModuleSource::Translated(ModuleLlvm {
             llcx: metadata_llcx,
             llmod: metadata_llmod,
+            tm: create_target_machine(tcx.sess),
         }),
         kind: ModuleKind::Metadata,
     };
@@ -935,8 +939,6 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         shared_ccx.tcx().collect_and_partition_translation_items(LOCAL_CRATE).1;
     let codegen_units = (*codegen_units).clone();
 
-    assert!(codegen_units.len() <= 1 || !tcx.sess.lto());
-
     let ongoing_translation = write::start_async_translation(
         tcx,
         time_graph.clone(),
@@ -945,24 +947,15 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         rx);
 
     // Translate an allocator shim, if any
-    //
-    // If LTO is enabled and we've got some previous LLVM module we translated
-    // above, then we can just translate directly into that LLVM module. If not,
-    // however, we need to create a separate module and trans into that. Note
-    // that the separate translation is critical for the standard library where
-    // the rlib's object file doesn't have allocator functions but the dylib
-    // links in an object file that has allocator functions. When we're
-    // compiling a final LTO artifact, though, there's no need to worry about
-    // this as we're not working with this dual "rlib/dylib" functionality.
-    let allocator_module = if tcx.sess.lto() {
-        None
-    } else if let Some(kind) = tcx.sess.allocator_kind.get() {
+    let allocator_module = if let Some(kind) = tcx.sess.allocator_kind.get() {
         unsafe {
+            let llmod_id = "allocator";
             let (llcx, llmod) =
-                context::create_context_and_module(tcx.sess, "allocator");
+                context::create_context_and_module(tcx.sess, llmod_id);
             let modules = ModuleLlvm {
                 llmod,
                 llcx,
+                tm: create_target_machine(tcx.sess),
             };
             time(tcx.sess.time_passes(), "write allocator module", || {
                 allocator::trans(tcx, &modules, kind)
@@ -970,6 +963,7 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
 
             Some(ModuleTranslation {
                 name: link::ALLOCATOR_MODULE_NAME.to_string(),
+                llmod_id: llmod_id.to_string(),
                 symbol_name_hash: 0, // we always rebuild allocator shims
                 source: ModuleSource::Translated(modules),
                 kind: ModuleKind::Allocator,
@@ -1002,10 +996,11 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         ongoing_translation.wait_for_signal_to_translate_item();
         ongoing_translation.check_for_errors(tcx.sess);
 
-        let _timing_guard = time_graph
-            .as_ref()
-            .map(|time_graph| time_graph.start(write::TRANS_WORKER_TIMELINE,
-                                               write::TRANS_WORK_PACKAGE_KIND));
+        let _timing_guard = time_graph.as_ref().map(|time_graph| {
+            time_graph.start(write::TRANS_WORKER_TIMELINE,
+                             write::TRANS_WORK_PACKAGE_KIND,
+                             &format!("codegen {}", cgu.name()))
+        });
         let start_time = Instant::now();
         all_stats.extend(tcx.compile_codegen_unit(*cgu.name()));
         total_trans_time += start_time.elapsed();
@@ -1336,6 +1331,16 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         let cgu_id = cgu.work_product_id();
         let symbol_name_hash = cgu.compute_symbol_name_hash(tcx);
 
+        // Append ".rs" to LLVM module identifier.
+        //
+        // The LLVM code generator emits a ".file filename" directive
+        // for ELF backends. The value of "filename" is set to the
+        // LLVM module identifier. Due to an LLVM MC bug[1], LLVM
+        // crashes if the module identifier is the same as another
+        // symbol in the module, such as a function name.
+        // 1. http://llvm.org/bugs/show_bug.cgi?id=11479
+        let llmod_id = format!("{}.rs", cgu.name());
+
         // Check whether there is a previous work-product we can
         // re-use.  Not only must the file exist, and the inputs not
         // be dirty, but the hash of the symbols we will generate must
@@ -1361,6 +1366,7 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         if let Some(buf) = previous_work_product {
             // Don't need to translate this module.
             let module = ModuleTranslation {
+                llmod_id: llmod_id,
                 name: cgu_name,
                 symbol_name_hash,
                 source: ModuleSource::Preexisting(buf.clone()),
@@ -1371,7 +1377,7 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
 
         // Instantiate translation items without filling out definitions yet...
         let scx = SharedCrateContext::new(tcx);
-        let lcx = LocalCrateContext::new(&scx, cgu);
+        let lcx = LocalCrateContext::new(&scx, cgu, &llmod_id);
         let module = {
             let ccx = CrateContext::new(&scx, &lcx);
             let trans_items = ccx.codegen_unit()
@@ -1423,20 +1429,9 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
             let llvm_module = ModuleLlvm {
                 llcx: ccx.llcx(),
                 llmod: ccx.llmod(),
+                tm: create_target_machine(ccx.sess()),
             };
 
-            // In LTO mode we inject the allocator shim into the existing
-            // module.
-            if ccx.sess().lto() {
-                if let Some(kind) = ccx.sess().allocator_kind.get() {
-                    time(ccx.sess().time_passes(), "write allocator module", || {
-                        unsafe {
-                            allocator::trans(ccx.tcx(), &llvm_module, kind);
-                        }
-                    });
-                }
-            }
-
             // Adjust exported symbols for MSVC dllimport
             if ccx.sess().target.target.options.is_like_msvc &&
                ccx.sess().crate_types.borrow().iter().any(|ct| *ct == config::CrateTypeRlib) {
@@ -1448,6 +1443,7 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
                 symbol_name_hash,
                 source: ModuleSource::Translated(llvm_module),
                 kind: ModuleKind::Regular,
+                llmod_id,
             }
         };
 
diff --git a/src/librustc_trans/context.rs b/src/librustc_trans/context.rs
index b394911c923..1722d008a54 100644
--- a/src/librustc_trans/context.rs
+++ b/src/librustc_trans/context.rs
@@ -320,19 +320,10 @@ impl<'b, 'tcx> SharedCrateContext<'b, 'tcx> {
 
 impl<'a, 'tcx> LocalCrateContext<'a, 'tcx> {
     pub fn new(shared: &SharedCrateContext<'a, 'tcx>,
-               codegen_unit: Arc<CodegenUnit<'tcx>>)
+               codegen_unit: Arc<CodegenUnit<'tcx>>,
+               llmod_id: &str)
                -> LocalCrateContext<'a, 'tcx> {
         unsafe {
-            // Append ".rs" to LLVM module identifier.
-            //
-            // LLVM code generator emits a ".file filename" directive
-            // for ELF backends. Value of the "filename" is set as the
-            // LLVM module identifier.  Due to a LLVM MC bug[1], LLVM
-            // crashes if the module identifier is same as other symbols
-            // such as a function name in the module.
-            // 1. http://llvm.org/bugs/show_bug.cgi?id=11479
-            let llmod_id = format!("{}.rs", codegen_unit.name());
-
             let (llcx, llmod) = create_context_and_module(&shared.tcx.sess,
                                                           &llmod_id[..]);
 
diff --git a/src/librustc_trans/lib.rs b/src/librustc_trans/lib.rs
index 796dfd4417c..c38b90dcf4f 100644
--- a/src/librustc_trans/lib.rs
+++ b/src/librustc_trans/lib.rs
@@ -68,17 +68,26 @@ pub use base::trans_crate;
 pub use metadata::LlvmMetadataLoader;
 pub use llvm_util::{init, target_features, print_version, print_passes, print, enable_llvm_debug};
 
+use std::any::Any;
+use std::path::PathBuf;
 use std::rc::Rc;
+use std::sync::mpsc;
 
+use rustc::dep_graph::DepGraph;
 use rustc::hir::def_id::CrateNum;
+use rustc::middle::cstore::MetadataLoader;
 use rustc::middle::cstore::{NativeLibrary, CrateSource, LibSource};
+use rustc::session::Session;
+use rustc::session::config::{OutputFilenames, OutputType};
 use rustc::ty::maps::Providers;
+use rustc::ty::{self, TyCtxt};
 use rustc::util::nodemap::{FxHashSet, FxHashMap};
 
 mod diagnostics;
 
 pub mod back {
     mod archive;
+    mod bytecode;
     mod command;
     pub(crate) mod linker;
     pub mod link;
@@ -138,14 +147,6 @@ mod type_;
 mod type_of;
 mod value;
 
-use std::sync::mpsc;
-use std::any::Any;
-use rustc::ty::{self, TyCtxt};
-use rustc::session::Session;
-use rustc::session::config::OutputFilenames;
-use rustc::middle::cstore::MetadataLoader;
-use rustc::dep_graph::DepGraph;
-
 pub struct LlvmTransCrate(());
 
 impl LlvmTransCrate {
@@ -202,12 +203,13 @@ pub struct ModuleTranslation {
     /// something unique to this crate (e.g., a module path) as well
     /// as the crate name and disambiguator.
     name: String,
+    llmod_id: String,
     symbol_name_hash: u64,
     pub source: ModuleSource,
     pub kind: ModuleKind,
 }
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, PartialEq)]
 pub enum ModuleKind {
     Regular,
     Metadata,
@@ -215,35 +217,32 @@ pub enum ModuleKind {
 }
 
 impl ModuleTranslation {
-    pub fn into_compiled_module(self, emit_obj: bool, emit_bc: bool) -> CompiledModule {
+    pub fn llvm(&self) -> Option<&ModuleLlvm> {
+        match self.source {
+            ModuleSource::Translated(ref llvm) => Some(llvm),
+            ModuleSource::Preexisting(_) => None,
+        }
+    }
+
+    pub fn into_compiled_module(self,
+                                emit_obj: bool,
+                                emit_bc: bool,
+                                outputs: &OutputFilenames) -> CompiledModule {
         let pre_existing = match self.source {
             ModuleSource::Preexisting(_) => true,
             ModuleSource::Translated(_) => false,
         };
+        let object = outputs.temp_path(OutputType::Object, Some(&self.name));
 
         CompiledModule {
+            llmod_id: self.llmod_id,
             name: self.name.clone(),
             kind: self.kind,
             symbol_name_hash: self.symbol_name_hash,
             pre_existing,
             emit_obj,
             emit_bc,
-        }
-    }
-}
-
-impl Drop for ModuleTranslation {
-    fn drop(&mut self) {
-        match self.source {
-            ModuleSource::Preexisting(_) => {
-                // Nothing to dispose.
-            },
-            ModuleSource::Translated(llvm) => {
-                unsafe {
-                    llvm::LLVMDisposeModule(llvm.llmod);
-                    llvm::LLVMContextDispose(llvm.llcx);
-                }
-            },
+            object,
         }
     }
 }
@@ -251,6 +250,8 @@ impl Drop for ModuleTranslation {
 #[derive(Debug)]
 pub struct CompiledModule {
     pub name: String,
+    pub llmod_id: String,
+    pub object: PathBuf,
     pub kind: ModuleKind,
     pub symbol_name_hash: u64,
     pub pre_existing: bool,
@@ -258,7 +259,6 @@ pub struct CompiledModule {
     pub emit_bc: bool,
 }
 
-#[derive(Clone)]
 pub enum ModuleSource {
     /// Copy the `.o` files or whatever from the incr. comp. directory.
     Preexisting(WorkProduct),
@@ -267,14 +267,25 @@ pub enum ModuleSource {
     Translated(ModuleLlvm),
 }
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Debug)]
 pub struct ModuleLlvm {
     llcx: llvm::ContextRef,
     pub llmod: llvm::ModuleRef,
+    tm: llvm::TargetMachineRef,
 }
 
-unsafe impl Send for ModuleTranslation { }
-unsafe impl Sync for ModuleTranslation { }
+unsafe impl Send for ModuleLlvm { }
+unsafe impl Sync for ModuleLlvm { }
+
+impl Drop for ModuleLlvm {
+    fn drop(&mut self) {
+        unsafe {
+            llvm::LLVMDisposeModule(self.llmod);
+            llvm::LLVMContextDispose(self.llcx);
+            llvm::LLVMRustDisposeTargetMachine(self.tm);
+        }
+    }
+}
 
 pub struct CrateTranslation {
     pub crate_name: Symbol,
diff --git a/src/librustc_trans/time_graph.rs b/src/librustc_trans/time_graph.rs
index ead6e432561..ec57af888e5 100644
--- a/src/librustc_trans/time_graph.rs
+++ b/src/librustc_trans/time_graph.rs
@@ -16,14 +16,15 @@ use std::io::prelude::*;
 use std::fs::File;
 
 const OUTPUT_WIDTH_IN_PX: u64 = 1000;
-const TIME_LINE_HEIGHT_IN_PX: u64 = 7;
-const TIME_LINE_HEIGHT_STRIDE_IN_PX: usize = 10;
+const TIME_LINE_HEIGHT_IN_PX: u64 = 20;
+const TIME_LINE_HEIGHT_STRIDE_IN_PX: usize = 30;
 
 #[derive(Clone)]
 struct Timing {
     start: Instant,
     end: Instant,
     work_package_kind: WorkPackageKind,
+    name: String,
 }
 
 #[derive(Clone, Copy, Hash, Eq, PartialEq, Debug)]
@@ -32,7 +33,7 @@ pub struct TimelineId(pub usize);
 #[derive(Clone)]
 struct PerThread {
     timings: Vec<Timing>,
-    open_work_package: Option<(Instant, WorkPackageKind)>,
+    open_work_package: Option<(Instant, WorkPackageKind, String)>,
 }
 
 #[derive(Clone)]
@@ -66,7 +67,8 @@ impl TimeGraph {
 
     pub fn start(&self,
                  timeline: TimelineId,
-                 work_package_kind: WorkPackageKind) -> RaiiToken {
+                 work_package_kind: WorkPackageKind,
+                 name: &str) -> RaiiToken {
         {
             let mut table = self.data.lock().unwrap();
 
@@ -76,7 +78,7 @@ impl TimeGraph {
             });
 
             assert!(data.open_work_package.is_none());
-            data.open_work_package = Some((Instant::now(), work_package_kind));
+            data.open_work_package = Some((Instant::now(), work_package_kind, name.to_string()));
         }
 
         RaiiToken {
@@ -92,17 +94,16 @@ impl TimeGraph {
         let mut table = self.data.lock().unwrap();
         let data = table.get_mut(&timeline).unwrap();
 
-        if let Some((start, work_package_kind)) = data.open_work_package {
+        if let Some((start, work_package_kind, name)) = data.open_work_package.take() {
             data.timings.push(Timing {
                 start,
                 end,
                 work_package_kind,
+                name,
             });
         } else {
             bug!("end timing without start?")
         }
-
-        data.open_work_package = None;
     }
 
     pub fn dump(&self, output_filename: &str) {
@@ -148,16 +149,18 @@ impl TimeGraph {
                 let colors = span.work_package_kind.0;
 
                 writeln!(file, "<div style='position:absolute; \
+                                            overflow:hidden; \
                                             top:{}px; \
                                             left:{}px; \
                                             width:{}px; \
                                             height:{}px; \
-                                            background:{};'></div>",
+                                            background:{};'>{}</div>",
                     line_top,
                     start,
                     end - start,
                     TIME_LINE_HEIGHT_IN_PX,
-                    colors[color % colors.len()]
+                    colors[color % colors.len()],
+                    span.name,
                     ).unwrap();
 
                 color += 1;
diff --git a/src/rustllvm/RustWrapper.cpp b/src/rustllvm/RustWrapper.cpp
index 15a04ba00e2..bc616f64881 100644
--- a/src/rustllvm/RustWrapper.cpp
+++ b/src/rustllvm/RustWrapper.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
 
 #include "llvm/IR/CallSite.h"
 
@@ -891,6 +892,23 @@ extern "C" bool LLVMRustLinkInExternalBitcode(LLVMModuleRef DstRef, char *BC,
   return true;
 }
 
+extern "C" bool LLVMRustLinkInParsedExternalBitcode(
+    LLVMModuleRef DstRef, LLVMModuleRef SrcRef) {
+#if LLVM_VERSION_GE(4, 0)
+  Module *Dst = unwrap(DstRef);
+  std::unique_ptr<Module> Src(unwrap(SrcRef));
+
+  if (Linker::linkModules(*Dst, std::move(Src))) {
+    LLVMRustSetLastError("failed to link modules");
+    return false;
+  }
+  return true;
+#else
+  LLVMRustSetLastError("can't link parsed modules on this LLVM");
+  return false;
+#endif
+}
+
 // Note that the two following functions look quite similar to the
 // LLVMGetSectionName function. Sadly, it appears that this function only
 // returns a char* pointer, which isn't guaranteed to be null-terminated. The
@@ -1403,3 +1421,47 @@ extern "C" void LLVMRustSetVisibility(LLVMValueRef V,
                                       LLVMRustVisibility RustVisibility) {
   LLVMSetVisibility(V, fromRust(RustVisibility));
 }
+
+struct LLVMRustModuleBuffer {
+  std::string data;
+};
+
+extern "C" LLVMRustModuleBuffer*
+LLVMRustModuleBufferCreate(LLVMModuleRef M) {
+  auto Ret = llvm::make_unique<LLVMRustModuleBuffer>();
+  {
+    raw_string_ostream OS(Ret->data);
+    {
+      legacy::PassManager PM;
+      PM.add(createBitcodeWriterPass(OS));
+      PM.run(*unwrap(M));
+    }
+  }
+  return Ret.release();
+}
+
+extern "C" void
+LLVMRustModuleBufferFree(LLVMRustModuleBuffer *Buffer) {
+  delete Buffer;
+}
+
+extern "C" const void*
+LLVMRustModuleBufferPtr(const LLVMRustModuleBuffer *Buffer) {
+  return Buffer->data.data();
+}
+
+extern "C" size_t
+LLVMRustModuleBufferLen(const LLVMRustModuleBuffer *Buffer) {
+  return Buffer->data.length();
+}
+
+extern "C" uint64_t
+LLVMRustModuleCost(LLVMModuleRef M) {
+  Module &Mod = *unwrap(M);
+  uint64_t cost = 0;
+  for (auto &F : Mod.functions()) {
+    (void)F;
+    cost += 1;
+  }
+  return cost;
+}
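
The natural Rust-side consumer of these four buffer entry points is a small
RAII wrapper that exposes the serialized bitcode as a byte slice and frees the
buffer on drop. A sketch of such a wrapper (only the four `LLVMRustModuleBuffer*`
functions come from the code above; `SerializedModule` and the opaque stand-in
types are illustrative):

    use std::slice;

    // Opaque stand-ins for the C++ types.
    enum Module {}
    enum ModuleBuffer {}
    type ModuleRef = *mut Module;

    extern "C" {
        fn LLVMRustModuleBufferCreate(m: ModuleRef) -> *mut ModuleBuffer;
        fn LLVMRustModuleBufferPtr(p: *const ModuleBuffer) -> *const u8;
        fn LLVMRustModuleBufferLen(p: *const ModuleBuffer) -> usize;
        fn LLVMRustModuleBufferFree(p: *mut ModuleBuffer);
    }

    struct SerializedModule(*mut ModuleBuffer);

    impl SerializedModule {
        unsafe fn new(llmod: ModuleRef) -> SerializedModule {
            SerializedModule(LLVMRustModuleBufferCreate(llmod))
        }

        fn data(&self) -> &[u8] {
            unsafe {
                // The pointer/length pair refers to the std::string owned by
                // the C++ buffer, which lives as long as this wrapper does.
                slice::from_raw_parts(LLVMRustModuleBufferPtr(self.0),
                                      LLVMRustModuleBufferLen(self.0))
            }
        }
    }

    impl Drop for SerializedModule {
        fn drop(&mut self) {
            unsafe { LLVMRustModuleBufferFree(self.0) }
        }
    }
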
diff --git a/src/test/run-pass/lto-many-codegen-units.rs b/src/test/run-pass/lto-many-codegen-units.rs
new file mode 100644
index 00000000000..bed675cee56
--- /dev/null
+++ b/src/test/run-pass/lto-many-codegen-units.rs
@@ -0,0 +1,15 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// compile-flags: -C lto -C codegen-units=8
+// no-prefer-dynamic
+
+fn main() {
+}