about summary refs log tree commit diff
diff options
context:
space:
mode:
authorantoyo <antoyo@users.noreply.github.com>2023-08-30 21:15:39 -0400
committerGitHub <noreply@github.com>2023-08-30 21:15:39 -0400
commit4e41a8a632552de954c2487f238335521cc73185 (patch)
treef761729cedf0ddb962acfc5976e158513c30b818
parent2b956f535e03e2ecaab3f078037ad006419f6c91 (diff)
parent62867dc29f8772db166139d954dbe606ab28c34b (diff)
downloadrust-4e41a8a632552de954c2487f238335521cc73185.tar.gz
rust-4e41a8a632552de954c2487f238335521cc73185.zip
Merge pull request #278 from rust-lang/feature/lto_2023-05-12
Add support for Link-Time Optimization
-rw-r--r--.github/workflows/ci.yml12
-rw-r--r--.github/workflows/release.yml22
-rw-r--r--.github/workflows/stdarch.yml14
-rw-r--r--Cargo.lock10
-rw-r--r--Cargo.toml4
-rw-r--r--Readme.md47
-rw-r--r--build_sysroot/Cargo.toml1
-rw-r--r--config.sh13
-rw-r--r--failing-lto-tests.txt23
-rw-r--r--failing-non-lto-tests.txt11
-rw-r--r--failing-ui-tests.txt19
-rw-r--r--locales/en-US.ftl65
-rw-r--r--messages.ftl14
-rw-r--r--patches/0001-Add-stdarch-Cargo.toml-for-testing.patch19
-rw-r--r--patches/0001-Disable-examples.patch25
-rw-r--r--src/back/lto.rs341
-rw-r--r--src/back/mod.rs1
-rw-r--r--src/back/write.rs102
-rw-r--r--src/base.rs15
-rw-r--r--src/declare.rs40
-rw-r--r--src/errors.rs31
-rw-r--r--src/lib.rs52
-rwxr-xr-xtest.sh6
23 files changed, 713 insertions, 174 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 419468209ef..4702494f05c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,8 +57,8 @@ jobs:
       uses: dawidd6/action-download-artifact@v2
       with:
           workflow: main.yml
-          name: ${{ matrix.libgccjit_version.gcc }}
-          path: gcc-build
+          name: gcc-13
+          path: gcc-13
           repo: antoyo/gcc
           branch: ${{ matrix.libgccjit_version.artifacts_branch }}
           event: push
@@ -71,9 +71,8 @@ jobs:
     - name: Setup path to libgccjit
       if: matrix.libgccjit_version.gcc != 'libgccjit12.so'
       run: |
-          echo $(readlink -f gcc-build) > gcc_path
-          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
-          ln gcc-build/libgccjit.so gcc-build/libgccjit.so.0
+          sudo dpkg --force-overwrite -i gcc-13/gcc-13.deb
+          echo /usr/lib/ > gcc_path
 
     - name: Set env
       run: |
@@ -141,6 +140,9 @@ jobs:
       if: ${{ matrix.libgccjit_version.gcc == 'libgccjit12.so' }}
       run: cat failing-ui-tests12.txt >> failing-ui-tests.txt
 
+    - name: Add more failing tests because the sysroot is not compiled with LTO
+      run: cat failing-non-lto-tests.txt >> failing-ui-tests.txt
+
     - name: Run tests
       run: |
         ${{ matrix.libgccjit_version.env_extra }} ./test.sh --release --clean --build-sysroot ${{ matrix.commands }} ${{ matrix.libgccjit_version.extra }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 655a94cbafd..51fc5c76cdb 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -18,8 +18,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        libgccjit_version:
-          - { gcc: "libgccjit.so", artifacts_branch: "master" }
         commands: [
           "--test-successful-rustc --nb-parts 2 --current-part 0",
           "--test-successful-rustc --nb-parts 2 --current-part 1",
@@ -40,18 +38,17 @@ jobs:
       uses: dawidd6/action-download-artifact@v2
       with:
           workflow: main.yml
-          name: ${{ matrix.libgccjit_version.gcc }}
-          path: gcc-build
+          name: gcc-13
+          path: gcc-13
           repo: antoyo/gcc
-          branch: ${{ matrix.libgccjit_version.artifacts_branch }}
+          branch: "master"
           event: push
           search_artifacts: true # Because, instead, the action only check the last job ran and that won't work since we want multiple artifacts.
 
     - name: Setup path to libgccjit
       run: |
-          echo $(readlink -f gcc-build) > gcc_path
-          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
-          ln gcc-build/libgccjit.so gcc-build/libgccjit.so.0
+          sudo dpkg --force-overwrite -i gcc-13/gcc-13.deb
+          echo /usr/lib/ > gcc_path
 
     - name: Set env
       run: |
@@ -89,7 +86,7 @@ jobs:
     - name: Build
       run: |
         ./y.sh prepare --only-libcore
-        ./build.sh --release --release-sysroot
+        EMBED_LTO_BITCODE=1 ./build.sh --release --release-sysroot
         cargo test
         ./clean_all.sh
 
@@ -98,6 +95,8 @@ jobs:
         git config --global user.email "user@example.com"
         git config --global user.name "User"
         ./y.sh prepare
+        # FIXME(antoyo): we cannot enable LTO for stdarch tests currently because of some failing LTO tests using proc-macros.
+        echo -n 'lto = "fat"' >> build_sysroot/Cargo.toml
 
     # Compile is a separate step, as the actions-rs/cargo action supports error annotations
     - name: Compile
@@ -106,6 +105,9 @@ jobs:
         command: build
         args: --release
 
+    - name: Add more failing tests because of undefined symbol errors (FIXME)
+      run: cat failing-lto-tests.txt >> failing-ui-tests.txt
+
     - name: Run tests
       run: |
-        ./test.sh --release --clean --release-sysroot --build-sysroot ${{ matrix.commands }}
+        EMBED_LTO_BITCODE=1 ./test.sh --release --clean --release-sysroot --build-sysroot ${{ matrix.commands }}
diff --git a/.github/workflows/stdarch.yml b/.github/workflows/stdarch.yml
index 193c77f33c4..c44d8efe3c7 100644
--- a/.github/workflows/stdarch.yml
+++ b/.github/workflows/stdarch.yml
@@ -18,8 +18,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        libgccjit_version:
-          - { gcc: "libgccjit.so", artifacts_branch: "master" }
         cargo_runner: [
           "sde -future -rtm_mode full --",
           "",
@@ -54,18 +52,17 @@ jobs:
       uses: dawidd6/action-download-artifact@v2
       with:
           workflow: main.yml
-          name: ${{ matrix.libgccjit_version.gcc }}
-          path: gcc-build
+          name: gcc-13
+          path: gcc-13
           repo: antoyo/gcc
-          branch: ${{ matrix.libgccjit_version.artifacts_branch }}
+          branch: "master"
           event: push
           search_artifacts: true # Because, instead, the action only check the last job ran and that won't work since we want multiple artifacts.
 
     - name: Setup path to libgccjit
       run: |
-          echo $(readlink -f gcc-build) > gcc_path
-          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
-          ln gcc-build/libgccjit.so gcc-build/libgccjit.so.0
+          sudo dpkg --force-overwrite -i gcc-13/gcc-13.deb
+          echo /usr/lib/ > gcc_path
 
     - name: Set env
       run: |
@@ -139,4 +136,5 @@ jobs:
       if: ${{ matrix.cargo_runner }}
       run: |
         cd build_sysroot/sysroot_src/library/stdarch/
+        # FIXME: these tests fail when the sysroot is compiled with LTO because of a missing symbol in proc-macro.
         STDARCH_TEST_EVERYTHING=1 CHANNEL=release CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="${{ matrix.cargo_runner }}" TARGET=x86_64-unknown-linux-gnu CG_RUSTFLAGS="-Ainternal_features" ../../../../cargo.sh test -- --skip rtm --skip tbm --skip sse4a
diff --git a/Cargo.lock b/Cargo.lock
index 40da783dbe8..404fb9c6db1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -147,6 +147,15 @@ dependencies = [
 ]
 
 [[package]]
+name = "object"
+version = "0.30.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03b4680b86d9cfafba8fc491dc9b6df26b68cf40e9e6cd73909194759a63c385"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
 name = "redox_syscall"
 version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -178,6 +187,7 @@ version = "0.1.0"
 dependencies = [
  "gccjit",
  "lang_tester",
+ "object",
  "smallvec",
  "tempfile",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 3bf629fc662..51fab147aa2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,10 @@ gccjit = { git = "https://github.com/antoyo/gccjit.rs" }
 # Local copy.
 #gccjit = { path = "../gccjit.rs" }
 
+object = { version = "0.30.1", default-features = false, features = [
+    "std",
+    "read",
+] }
 smallvec = { version = "1.6.1", features = ["union", "may_dangle"] }
 # TODO(antoyo): make tempfile optional.
 tempfile = "3.7.1"
diff --git a/Readme.md b/Readme.md
index 47fb840efb9..daee6e8588d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -91,6 +91,9 @@ $ CHANNEL="release" $CG_GCCJIT_DIR/cargo.sh run
 
 If you compiled cg_gccjit in debug mode (aka you didn't pass `--release` to `./test.sh`) you should use `CHANNEL="debug"` instead or omit `CHANNEL="release"` completely.
 
+To use LTO, you need to set the variables `FAT_LTO=1` and `EMBED_LTO_BITCODE=1` in addition to setting `lto = "fat"` in the `Cargo.toml`.
+Don't set `FAT_LTO` when compiling the sysroot, though: only set `EMBED_LTO_BITCODE=1`.
+
 ### Rustc
 
 > You should prefer using the Cargo method.
@@ -191,6 +194,48 @@ set substitute-path /usr/src/debug/gcc /path/to/gcc-repo/gcc
 
 TODO(antoyo): but that's not what I remember I was doing.
 
+### `failed to build archive` error
+
+When you get this error:
+
+```
+error: failed to build archive: failed to open object file: No such file or directory (os error 2)
+```
+
+That can be caused by the fact that you try to compile with `lto = "fat"`, but you didn't compile the sysroot with LTO.
+(Not sure if that's the reason since I cannot reproduce it anymore. Maybe it happened when forgetting to set `FAT_LTO`.)
+
+### How to debug GCC LTO
+
+Run the command with `-v -save-temps` and then extract the `lto1` line from the output and run that under the debugger.
+
+### How to send arguments to the GCC linker
+
+```
+CG_RUSTFLAGS="-Clink-args=-save-temps -v" ../cargo.sh build
+```
+
+### How to see the personality functions in the asm dump
+
+```
+CG_RUSTFLAGS="-Clink-arg=-save-temps -v -Clink-arg=-dA" ../cargo.sh build
+```
+
+### How to see the LLVM IR for a sysroot crate
+
+```
+cargo build -v --target x86_64-unknown-linux-gnu -Zbuild-std
+# Take the command from the output and add --emit=llvm-ir
+```
+
+### To prevent the linker from demangling symbols
+
+Run with:
+
+```
+COLLECT_NO_DEMANGLE=1
+```
+
 ### How to use a custom-build rustc
 
  * Build the stage2 compiler (`rustup toolchain link debug-current build/x86_64-unknown-linux-gnu/stage2`).
@@ -253,4 +298,4 @@ generate it in [gimple.md](./doc/gimple.md).
  * Set `linker='-Clinker=m68k-linux-gcc'`.
  * Set the path to the cross-compiling libgccjit in `gcc_path`.
  * Comment the line: `context.add_command_line_option("-masm=intel");` in src/base.rs.
- * (might not be necessary) Disable the compilation of libstd.so (and possibly libcore.so?).
+ * (might not be necessary) Disable the compilation of libstd.so (and possibly libcore.so?): Remove dylib from build_sysroot/sysroot_src/library/std/Cargo.toml.
diff --git a/build_sysroot/Cargo.toml b/build_sysroot/Cargo.toml
index dca2ffdc24b..e5658273c97 100644
--- a/build_sysroot/Cargo.toml
+++ b/build_sysroot/Cargo.toml
@@ -19,3 +19,4 @@ rustc-std-workspace-std = { path = "./sysroot_src/library/rustc-std-workspace-st
 
 [profile.release]
 debug = true
+#lto = "fat" # TODO(antoyo): re-enable when the failing LTO tests regarding proc-macros are fixed.
diff --git a/config.sh b/config.sh
index 166e83901c4..ecc6d56b00e 100644
--- a/config.sh
+++ b/config.sh
@@ -38,7 +38,14 @@ if [[ "$HOST_TRIPLE" != "$TARGET_TRIPLE" ]]; then
    fi
 fi
 
-export RUSTFLAGS="$CG_RUSTFLAGS $linker -Csymbol-mangling-version=v0 -Cdebuginfo=2 -Clto=off -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot $TEST_FLAGS"
+# Since we don't support ThinLTO, disable LTO completely when not trying to do LTO.
+# TODO(antoyo): remove when we can handle ThinLTO.
+disable_lto_flags=''
+if [[ ! -v FAT_LTO ]]; then
+    disable_lto_flags='-Clto=off'
+fi
+
+export RUSTFLAGS="$CG_RUSTFLAGS $linker -Csymbol-mangling-version=v0 -Cdebuginfo=2 $disable_lto_flags -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot $TEST_FLAGS"
 
 # FIXME(antoyo): remove once the atomic shim is gone
 if [[ `uname` == 'Darwin' ]]; then
@@ -50,3 +57,7 @@ export RUSTC_LOG=warn # display metadata load errors
 
 export LD_LIBRARY_PATH="$(pwd)/target/out:$(pwd)/build_sysroot/sysroot/lib/rustlib/$TARGET_TRIPLE/lib:$GCC_PATH"
 export DYLD_LIBRARY_PATH=$LD_LIBRARY_PATH
+# NOTE: To avoid the -fno-inline errors, use /opt/gcc/bin/gcc instead of cc.
+# To do so, add a symlink for cc to /opt/gcc/bin/gcc in our PATH.
+# Another option would be to add the following Rust flag: -Clinker=/opt/gcc/bin/gcc
+export PATH="/opt/gcc/bin:$PATH"
diff --git a/failing-lto-tests.txt b/failing-lto-tests.txt
new file mode 100644
index 00000000000..2e0b6134070
--- /dev/null
+++ b/failing-lto-tests.txt
@@ -0,0 +1,23 @@
+tests/ui/lint/unsafe_code/forge_unsafe_block.rs
+tests/ui/lint/unused-qualification-in-derive-expansion.rs
+tests/ui/macro-quote-test.rs
+tests/ui/macros/proc_macro.rs
+tests/ui/panic-runtime/lto-unwind.rs
+tests/ui/resolve/derive-macro-1.rs
+tests/ui/resolve/derive-macro-2.rs
+tests/ui/rfcs/rfc-2565-param-attrs/param-attrs-pretty.rs
+tests/ui/rfcs/rfc-2565-param-attrs/issue-64682-dropping-first-attrs-in-impl-fns.rs
+tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs
+tests/ui/rust-2018/suggestions-not-always-applicable.rs
+tests/ui/rust-2021/reserved-prefixes-via-macro.rs
+tests/ui/underscore-imports/duplicate.rs
+tests/ui/async-await/issues/issue-60674.rs
+tests/ui/attributes/main-removed-2/main.rs
+tests/ui/cfg/assume-incomplete-release/assume-incomplete.rs
+tests/ui/crate-loading/cross-compiled-proc-macro.rs
+tests/ui/derives/derive-marker-tricky.rs
+tests/ui/diagnostic_namespace/existing_proc_macros.rs
+tests/ui/fmt/format-args-capture-issue-106408.rs
+tests/ui/fmt/indoc-issue-106408.rs
+tests/ui/hygiene/issue-77523-def-site-async-await.rs
+tests/ui/inherent-impls-overlap-check/no-overlap.rs
diff --git a/failing-non-lto-tests.txt b/failing-non-lto-tests.txt
new file mode 100644
index 00000000000..2f338f7d3c8
--- /dev/null
+++ b/failing-non-lto-tests.txt
@@ -0,0 +1,11 @@
+tests/ui/issues/issue-44056.rs
+tests/ui/lto/fat-lto.rs
+tests/ui/lto/debuginfo-lto.rs
+tests/ui/lto/lto-many-codegen-units.rs
+tests/ui/lto/issue-100772.rs
+tests/ui/lto/lto-rustc-loads-linker-plugin.rs
+tests/ui/panic-runtime/lto-unwind.rs
+tests/ui/sanitize/issue-111184-generator-witness.rs
+tests/ui/sepcomp/sepcomp-lib-lto.rs
+tests/ui/lto/lto-opt-level-s.rs
+tests/ui/lto/lto-opt-level-z.rs
diff --git a/failing-ui-tests.txt b/failing-ui-tests.txt
index fe0df3347bb..0711ae99a3e 100644
--- a/failing-ui-tests.txt
+++ b/failing-ui-tests.txt
@@ -1,11 +1,5 @@
-tests/ui/allocator/custom-in-block.rs
-tests/ui/allocator/custom-in-submodule.rs
-tests/ui/allocator/custom.rs
-tests/ui/allocator/hygiene.rs
 tests/ui/allocator/no_std-alloc-error-handler-custom.rs
 tests/ui/allocator/no_std-alloc-error-handler-default.rs
-tests/ui/allocator/xcrate-use.rs
-tests/ui/allocator/xcrate-use2.rs
 tests/ui/asm/may_unwind.rs
 tests/ui/asm/x86_64/multiple-clobber-abi.rs
 tests/ui/debuginfo/debuginfo-emit-llvm-ir-and-split-debuginfo.rs
@@ -14,7 +8,6 @@ tests/ui/linkage-attr/linkage1.rs
 tests/ui/lto/dylib-works.rs
 tests/ui/numbers-arithmetic/saturating-float-casts.rs
 tests/ui/polymorphization/promoted-function.rs
-tests/ui/process/nofile-limit.rs
 tests/ui/sepcomp/sepcomp-cci.rs
 tests/ui/sepcomp/sepcomp-extern.rs
 tests/ui/sepcomp/sepcomp-fns-backwards.rs
@@ -53,8 +46,8 @@ tests/ui/issues/issue-40883.rs
 tests/ui/issues/issue-43853.rs
 tests/ui/issues/issue-47364.rs
 tests/ui/macros/rfc-2011-nicer-assert-messages/assert-without-captures-does-not-create-unnecessary-code.rs
-tests/ui/rfcs/rfc-2091-track-caller/std-panic-locations.rs
 tests/ui/rfcs/rfc-1857-stabilize-drop-order/drop-order.rs
+tests/ui/rfcs/rfc-2091-track-caller/std-panic-locations.rs
 tests/ui/simd/issue-17170.rs
 tests/ui/simd/issue-39720.rs
 tests/ui/simd/issue-89193.rs
@@ -64,9 +57,15 @@ tests/ui/alloc-error/default-alloc-error-hook.rs
 tests/ui/generator/panic-safe.rs
 tests/ui/issues/issue-14875.rs
 tests/ui/issues/issue-29948.rs
-tests/ui/panic-while-printing.rs
-tests/ui/enum-discriminant/get_discr.rs
 tests/ui/panics/nested_panic_caught.rs
 tests/ui/simd/intrinsic/generic-bswap-byte.rs
 tests/ui/const_prop/ice-issue-111353.rs
 tests/ui/process/println-with-broken-pipe.rs
+tests/ui/panic-runtime/lto-abort.rs
+tests/ui/lto/thin-lto-inlines2.rs
+tests/ui/lto/weak-works.rs
+tests/ui/lto/thin-lto-inlines.rs
+tests/ui/lto/thin-lto-global-allocator.rs
+tests/ui/lto/msvc-imp-present.rs
+tests/ui/lto/lto-thin-rustc-loads-linker-plugin.rs
+tests/ui/lto/all-crates.rs
diff --git a/locales/en-US.ftl b/locales/en-US.ftl
deleted file mode 100644
index 2181d49eeef..00000000000
--- a/locales/en-US.ftl
+++ /dev/null
@@ -1,65 +0,0 @@
-codegen_gcc_unwinding_inline_asm =
-    GCC backend does not support unwinding from inline asm
-
-codegen_gcc_lto_not_supported =
-    LTO is not supported. You may get a linker error.
-
-codegen_gcc_invalid_monomorphization_basic_integer =
-    invalid monomorphization of `{$name}` intrinsic: expected basic integer type, found `{$ty}`
-
-codegen_gcc_invalid_monomorphization_invalid_float_vector =
-    invalid monomorphization of `{$name}` intrinsic: unsupported element type `{$elem_ty}` of floating-point vector `{$vec_ty}`
-
-codegen_gcc_invalid_monomorphization_not_float =
-    invalid monomorphization of `{$name}` intrinsic: `{$ty}` is not a floating-point type
-
-codegen_gcc_invalid_monomorphization_unrecognized =
-    invalid monomorphization of `{$name}` intrinsic: unrecognized intrinsic `{$name}`
-
-codegen_gcc_invalid_monomorphization_expected_signed_unsigned =
-    invalid monomorphization of `{$name}` intrinsic: expected element type `{$elem_ty}` of vector type `{$vec_ty}` to be a signed or unsigned integer type
-
-codegen_gcc_invalid_monomorphization_unsupported_element =
-    invalid monomorphization of `{$name}` intrinsic: unsupported {$name} from `{$in_ty}` with element `{$elem_ty}` to `{$ret_ty}`
-
-codegen_gcc_invalid_monomorphization_invalid_bitmask =
-    invalid monomorphization of `{$name}` intrinsic: invalid bitmask `{$ty}`, expected `u{$expected_int_bits}` or `[u8; {$expected_bytes}]`
-
-codegen_gcc_invalid_monomorphization_simd_shuffle =
-    invalid monomorphization of `{$name}` intrinsic: simd_shuffle index must be an array of `u32`, got `{$ty}`
-
-codegen_gcc_invalid_monomorphization_expected_simd =
-    invalid monomorphization of `{$name}` intrinsic: expected SIMD {$expected_ty} type, found non-SIMD `{$found_ty}`
-
-codegen_gcc_invalid_monomorphization_mask_type =
-    invalid monomorphization of `{$name}` intrinsic: mask element type is `{$ty}`, expected `i_`
-
-codegen_gcc_invalid_monomorphization_return_length =
-    invalid monomorphization of `{$name}` intrinsic: expected return type of length {$in_len}, found `{$ret_ty}` with length {$out_len}
-
-codegen_gcc_invalid_monomorphization_return_length_input_type =
-    invalid monomorphization of `{$name}` intrinsic: expected return type with length {$in_len} (same as input type `{$in_ty}`), found `{$ret_ty}` with length {$out_len}
-
-codegen_gcc_invalid_monomorphization_return_element =
-    invalid monomorphization of `{$name}` intrinsic: expected return element type `{$in_elem}` (element of input `{$in_ty}`), found `{$ret_ty}` with element type `{$out_ty}`
-
-codegen_gcc_invalid_monomorphization_return_type =
-    invalid monomorphization of `{$name}` intrinsic: expected return type `{$in_elem}` (element of input `{$in_ty}`), found `{$ret_ty}`
-
-codegen_gcc_invalid_monomorphization_inserted_type =
-    invalid monomorphization of `{$name}` intrinsic: expected inserted type `{$in_elem}` (element of input `{$in_ty}`), found `{$out_ty}`
-
-codegen_gcc_invalid_monomorphization_return_integer_type =
-    invalid monomorphization of `{$name}` intrinsic: expected return type with integer elements, found `{$ret_ty}` with non-integer `{$out_ty}`
-
-codegen_gcc_invalid_monomorphization_mismatched_lengths =
-    invalid monomorphization of `{$name}` intrinsic: mismatched lengths: mask length `{$m_len}` != other vector length `{$v_len}`
-
-codegen_gcc_invalid_monomorphization_unsupported_cast =
-    invalid monomorphization of `{$name}` intrinsic: unsupported cast from `{$in_ty}` with element `{$in_elem}` to `{$ret_ty}` with element `{$out_elem}`
-
-codegen_gcc_invalid_monomorphization_unsupported_operation =
-    invalid monomorphization of `{$name}` intrinsic: unsupported operation on `{$in_ty}` with element `{$in_elem}`
-
-codegen_gcc_invalid_minimum_alignment =
-    invalid minimum global alignment: {$err}
diff --git a/messages.ftl b/messages.ftl
index 2fd0daee3e7..de9be3a5528 100644
--- a/messages.ftl
+++ b/messages.ftl
@@ -9,3 +9,17 @@ codegen_gcc_tied_target_features = the target features {$features} must all be e
 
 codegen_gcc_unwinding_inline_asm =
     GCC backend does not support unwinding from inline asm
+
+codegen_gcc_copy_bitcode = failed to copy bitcode to object file: {$err}
+
+codegen_gcc_dynamic_linking_with_lto =
+    cannot prefer dynamic linking when performing LTO
+    .note = only 'staticlib', 'bin', and 'cdylib' outputs are supported with LTO
+
+codegen_gcc_load_bitcode = failed to load bitcode of module "{$name}"
+
+codegen_gcc_lto_disallowed = lto can only be run for executables, cdylibs and static library outputs
+
+codegen_gcc_lto_dylib = lto cannot be used for `dylib` crate type without `-Zdylib-lto`
+
+codegen_gcc_lto_bitcode_from_rlib = failed to get bitcode from object file for LTO ({$gcc_err})
diff --git a/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch b/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch
index 93c63b5dcac..2a55f2cb796 100644
--- a/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch
+++ b/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch
@@ -1,25 +1,26 @@
-From c3821e02fbd6cb5ad6e06d759fccdc9073712375 Mon Sep 17 00:00:00 2001
+From b8f3eed3053c9333b5dfbeaeb2a6a65a4b3156df Mon Sep 17 00:00:00 2001
 From: Antoni Boucher <bouanto@zoho.com>
-Date: Tue, 7 Jun 2022 21:40:13 -0400
-Subject: [PATCH] Add stdarch Cargo.toml for testing
+Date: Tue, 29 Aug 2023 13:06:34 -0400
+Subject: [PATCH] Patch 0001-Add-stdarch-Cargo.toml-for-testing.patch
 
 ---
- library/stdarch/Cargo.toml | 20 ++++++++++++++++++++
- 1 file changed, 20 insertions(+)
+ library/stdarch/Cargo.toml | 23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
  create mode 100644 library/stdarch/Cargo.toml
 
 diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
 new file mode 100644
-index 0000000..fbe0a95
+index 0000000..4c63700
 --- /dev/null
 +++ b/library/stdarch/Cargo.toml
-@@ -0,0 +1,20 @@
+@@ -0,0 +1,21 @@
 +[workspace]
++resolver = "1"
 +members = [
 +  "crates/core_arch",
 +  "crates/std_detect",
 +  "crates/stdarch-gen",
-+  "examples/"
++  #"examples/"
 +]
 +exclude = [
 +  "crates/wasm-assert-instr-tests"
@@ -35,5 +36,5 @@ index 0000000..fbe0a95
 +opt-level = 3
 +incremental = true
 -- 
-2.26.2.7.g19db9cfb68.dirty
+2.42.0
 
diff --git a/patches/0001-Disable-examples.patch b/patches/0001-Disable-examples.patch
deleted file mode 100644
index 1b71df1ca8d..00000000000
--- a/patches/0001-Disable-examples.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From a2d53a324a02c04b76c0e9d39dc15cd443a3b8b2 Mon Sep 17 00:00:00 2001
-From: Antoni Boucher <bouanto@zoho.com>
-Date: Fri, 25 Nov 2022 11:18:11 -0500
-Subject: [PATCH] Disable examples
-
----
- library/stdarch/Cargo.toml | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
-index fbe0a95..748d72d 100644
---- a/library/stdarch/Cargo.toml
-+++ b/library/stdarch/Cargo.toml
-@@ -3,7 +3,7 @@ members = [
-   "crates/core_arch",
-   "crates/std_detect",
-   "crates/stdarch-gen",
--  "examples/"
-+  #"examples/"
- ]
- exclude = [
-   "crates/wasm-assert-instr-tests"
--- 
-2.26.2.7.g19db9cfb68.dirty
-
diff --git a/src/back/lto.rs b/src/back/lto.rs
new file mode 100644
index 00000000000..529454b119e
--- /dev/null
+++ b/src/back/lto.rs
@@ -0,0 +1,341 @@
+/// GCC requires using the same toolchain for the whole compilation when doing LTO.
+/// So, we need the same version/commit of the linker (gcc) and lto front-end binaries (lto1,
+/// lto-wrapper, liblto_plugin.so).
+
+// FIXME(antoyo): the executables compiled with LTO are bigger than those compiled without LTO.
+// Since it is the opposite for cg_llvm, check if this is normal.
+//
+// Maybe we embed the bitcode in the final binary?
+// It doesn't look like we try to generate fat objects for the final binary.
+// Check if the way we combine the object files makes it keep the LTO sections on the final link.
+// Maybe that's because the combined object files contain the IR (true) and the final link
+// does not remove it?
+//
+// TODO(antoyo): for performance, check which optimizations the C++ frontend enables.
+//
+// Fix these warnings:
+// /usr/bin/ld: warning: type of symbol `_RNvNvNvNtCs5JWOrf9uCus_5rayon11thread_pool19WORKER_THREAD_STATE7___getit5___KEY' changed from 1 to 6 in /tmp/ccKeUSiR.ltrans0.ltrans.o
+// /usr/bin/ld: warning: type of symbol `_RNvNvNvNvNtNtNtCsAj5i4SGTR7_3std4sync4mpmc5waker17current_thread_id5DUMMY7___getit5___KEY' changed from 1 to 6 in /tmp/ccKeUSiR.ltrans0.ltrans.o
+// /usr/bin/ld: warning: incremental linking of LTO and non-LTO objects; using -flinker-output=nolto-rel which will bypass whole program optimization
+
+use std::ffi::CString;
+use std::fs::{self, File};
+use std::path::{Path, PathBuf};
+
+use gccjit::OutputKind;
+use object::read::archive::ArchiveFile;
+use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule};
+use rustc_codegen_ssa::back::symbol_export;
+use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
+use rustc_codegen_ssa::traits::*;
+use rustc_codegen_ssa::{looks_like_rust_object_file, ModuleCodegen, ModuleKind};
+use rustc_data_structures::memmap::Mmap;
+use rustc_errors::{FatalError, Handler};
+use rustc_hir::def_id::LOCAL_CRATE;
+use rustc_middle::dep_graph::WorkProduct;
+use rustc_middle::middle::exported_symbols::{SymbolExportInfo, SymbolExportLevel};
+use rustc_session::config::{CrateType, Lto};
+use tempfile::{TempDir, tempdir};
+
+use crate::back::write::save_temp_bitcode;
+use crate::errors::{
+    DynamicLinkingWithLTO, LtoBitcodeFromRlib, LtoDisallowed, LtoDylib,
+};
+use crate::{GccCodegenBackend, GccContext, to_gcc_opt_level};
+
+/// We keep track of the computed LTO cache keys from the previous
+/// session to determine which CGUs we can reuse.
+//pub const THIN_LTO_KEYS_INCR_COMP_FILE_NAME: &str = "thin-lto-past-keys.bin";
+
+pub fn crate_type_allows_lto(crate_type: CrateType) -> bool {
+    match crate_type {
+        CrateType::Executable | CrateType::Dylib | CrateType::Staticlib | CrateType::Cdylib => true,
+        CrateType::Rlib | CrateType::ProcMacro => false,
+    }
+}
+
+struct LtoData {
+    // TODO(antoyo): use symbols_below_threshold.
+    //symbols_below_threshold: Vec<CString>,
+    upstream_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
+    tmp_path: TempDir,
+}
+
+fn prepare_lto(cgcx: &CodegenContext<GccCodegenBackend>, diag_handler: &Handler) -> Result<LtoData, FatalError> {
+    let export_threshold = match cgcx.lto {
+        // We're just doing LTO for our one crate
+        Lto::ThinLocal => SymbolExportLevel::Rust,
+
+        // We're doing LTO for the entire crate graph
+        Lto::Fat | Lto::Thin => symbol_export::crates_export_threshold(&cgcx.crate_types),
+
+        Lto::No => panic!("didn't request LTO but we're doing LTO"),
+    };
+
+    let tmp_path =
+        match tempdir() {
+            Ok(tmp_path) => tmp_path,
+            Err(error) => {
+                eprintln!("Cannot create temporary directory: {}", error);
+                return Err(FatalError);
+            },
+        };
+
+    let symbol_filter = &|&(ref name, info): &(String, SymbolExportInfo)| {
+        if info.level.is_below_threshold(export_threshold) || info.used {
+            Some(CString::new(name.as_str()).unwrap())
+        } else {
+            None
+        }
+    };
+    let exported_symbols = cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
+    let mut symbols_below_threshold = {
+        let _timer = cgcx.prof.generic_activity("GCC_lto_generate_symbols_below_threshold");
+        exported_symbols[&LOCAL_CRATE].iter().filter_map(symbol_filter).collect::<Vec<CString>>()
+    };
+    info!("{} symbols to preserve in this crate", symbols_below_threshold.len());
+
+    // If we're performing LTO for the entire crate graph, then for each of our
+    // upstream dependencies, find the corresponding rlib and load the bitcode
+    // from the archive.
+    //
+    // We save off all the bytecode and GCC module file path for later processing
+    // with either fat or thin LTO
+    let mut upstream_modules = Vec::new();
+    if cgcx.lto != Lto::ThinLocal {
+        // Make sure we actually can run LTO
+        for crate_type in cgcx.crate_types.iter() {
+            if !crate_type_allows_lto(*crate_type) {
+                diag_handler.emit_err(LtoDisallowed);
+                return Err(FatalError);
+            } else if *crate_type == CrateType::Dylib {
+                if !cgcx.opts.unstable_opts.dylib_lto {
+                    diag_handler.emit_err(LtoDylib);
+                    return Err(FatalError);
+                }
+            }
+        }
+
+        if cgcx.opts.cg.prefer_dynamic && !cgcx.opts.unstable_opts.dylib_lto {
+            diag_handler.emit_err(DynamicLinkingWithLTO);
+            return Err(FatalError);
+        }
+
+        for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
+            let exported_symbols =
+                cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
+            {
+                let _timer =
+                    cgcx.prof.generic_activity("GCC_lto_generate_symbols_below_threshold");
+                symbols_below_threshold
+                    .extend(exported_symbols[&cnum].iter().filter_map(symbol_filter));
+            }
+
+            let archive_data = unsafe {
+                Mmap::map(File::open(&path).expect("couldn't open rlib"))
+                    .expect("couldn't map rlib")
+            };
+            let archive = ArchiveFile::parse(&*archive_data).expect("wanted an rlib");
+            let obj_files = archive
+                .members()
+                .filter_map(|child| {
+                    child.ok().and_then(|c| {
+                        std::str::from_utf8(c.name()).ok().map(|name| (name.trim(), c))
+                    })
+                })
+                .filter(|&(name, _)| looks_like_rust_object_file(name));
+            for (name, child) in obj_files {
+                info!("adding bitcode from {}", name);
+                let path = tmp_path.path().join(name);
+                match save_as_file(child.data(&*archive_data).expect("corrupt rlib"), &path) {
+                    Ok(()) => {
+                        let buffer = ModuleBuffer::new(path);
+                        let module = SerializedModule::Local(buffer);
+                        upstream_modules.push((module, CString::new(name).unwrap()));
+                    }
+                    Err(e) => {
+                        diag_handler.emit_err(e);
+                        return Err(FatalError);
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(LtoData {
+        //symbols_below_threshold,
+        upstream_modules,
+        tmp_path,
+    })
+}
+
+fn save_as_file(obj: &[u8], path: &Path) -> Result<(), LtoBitcodeFromRlib> { // Writes the bytes in `obj` to the file at `path`.
+    fs::write(path, obj)
+        .map_err(|error| LtoBitcodeFromRlib { // Convert the I/O error into the diagnostic type returned to the caller.
+            gcc_err: format!("write object file to temp dir: {}", error)
+        })
+}
+
+/// Entry point for fat LTO: collects the upstream modules with `prepare_lto`, then hands
+/// them, `modules` and `cached_modules` to `fat_lto` and returns the module it produces.
+pub(crate) fn run_fat(
+    cgcx: &CodegenContext<GccCodegenBackend>,
+    modules: Vec<FatLtoInput<GccCodegenBackend>>,
+    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
+) -> Result<LtoModuleCodegen<GccCodegenBackend>, FatalError> {
+    let diag_handler = cgcx.create_diag_handler();
+    let lto_data = prepare_lto(cgcx, &diag_handler)?; // An error from `prepare_lto` is returned to the caller as is.
+    /*let symbols_below_threshold =
+        lto_data.symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();*/
+    fat_lto(cgcx, &diag_handler, modules, cached_modules, lto_data.upstream_modules, lto_data.tmp_path,
+        //&symbols_below_threshold,
+    )
+}
+
+fn fat_lto(cgcx: &CodegenContext<GccCodegenBackend>, _diag_handler: &Handler, modules: Vec<FatLtoInput<GccCodegenBackend>>, cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>, mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>, tmp_path: TempDir,
+    //symbols_below_threshold: &[*const libc::c_char],
+) -> Result<LtoModuleCodegen<GccCodegenBackend>, FatalError> { // Picks a base module and registers every other module's object file on its context.
+    let _timer = cgcx.prof.generic_activity("GCC_fat_lto_build_monolithic_module");
+    info!("going for a fat lto");
+
+    // Sort out all our lists of incoming modules into two lists.
+    //
+    // * `serialized_modules` (also an argument to this function) contains all
+    //   modules that are serialized (here a `ModuleBuffer` wraps a file path).
+    // * `in_memory` contains modules which are already parsed and in-memory,
+    //   such as from multi-CGU builds.
+    //
+    // All of `cached_modules` (cached from previous incremental builds) can
+    // immediately go onto the `serialized_modules` modules list and then we can
+    // split the `modules` array into these two lists.
+    let mut in_memory = Vec::new();
+    serialized_modules.extend(cached_modules.into_iter().map(|(buffer, wp)| {
+        info!("pushing cached module {:?}", wp.cgu_name);
+        (buffer, CString::new(wp.cgu_name).unwrap())
+    }));
+    for module in modules {
+        match module {
+            FatLtoInput::InMemory(m) => in_memory.push(m),
+            FatLtoInput::Serialized { name, buffer } => {
+                info!("pushing serialized module {:?}", name);
+                let buffer = SerializedModule::Local(buffer);
+                serialized_modules.push((buffer, CString::new(name).unwrap()));
+            }
+        }
+    }
+
+    // Find the "costliest" module and merge everything into that codegen unit.
+    // All the other modules will be serialized and reparsed into the new
+    // context, so this hopefully avoids serializing and parsing the largest
+    // codegen unit.
+    //
+    // Additionally use a regular module as the base here to ensure that various
+    // file copy operations in the backend work correctly. The only other kind
+    // of module here should be an allocator one, and if your crate is smaller
+    // than the allocator module then the size doesn't really matter anyway.
+    let costliest_module = in_memory
+        .iter()
+        .enumerate()
+        .filter(|&(_, module)| module.kind == ModuleKind::Regular)
+        .map(|(i, _module)| {
+            //let cost = unsafe { llvm::LLVMRustModuleCost(module.module_llvm.llmod()) };
+            // TODO(antoyo): compute the cost of a module if GCC allows this.
+            (0, i) // Every module gets cost 0, so `max()` selects the regular module with the highest index.
+        })
+        .max();
+
+    // If we found a costliest module, we're good to go. Otherwise all our
+    // inputs were serialized which could happen in the case, for example, that
+    // all our inputs were incrementally reread from the cache and we're just
+    // re-executing the LTO passes. That case is not supported here: the `None`
+    // arm below panics with `unimplemented!("Incremental")`.
+    let mut module: ModuleCodegen<GccContext> = match costliest_module {
+        Some((_cost, i)) => in_memory.remove(i), // Take the base module out; the remaining ones are compiled to files below.
+        None => {
+            unimplemented!("Incremental");
+            /*assert!(!serialized_modules.is_empty(), "must have at least one serialized module");
+            let (buffer, name) = serialized_modules.remove(0);
+            info!("no in-memory regular modules to choose from, parsing {:?}", name);
+            ModuleCodegen {
+                module_llvm: GccContext::parse(cgcx, &name, buffer.data(), diag_handler)?,
+                name: name.into_string().unwrap(),
+                kind: ModuleKind::Regular,
+            }*/
+        }
+    };
+    let mut serialized_bitcode = Vec::new();
+    {
+        info!("using {:?} as a base module", module.name);
+
+        // We cannot load and merge GCC contexts in memory like cg_llvm is doing.
+        // Instead, we combine the object files into a single object file.
+        for module in in_memory {
+            let path = tmp_path.path().to_path_buf().join(&module.name); // Output file named after the module, inside the LTO temp dir.
+            let path = path.to_str().expect("path");
+            let context = &module.module_llvm.context;
+            let config = cgcx.config(module.kind);
+            // NOTE: we need to set the optimization level here in order for LTO to do its job.
+            context.set_optimization_level(to_gcc_opt_level(config.opt_level));
+            context.add_command_line_option("-flto=auto");
+            context.add_command_line_option("-flto-partition=one");
+            context.compile_to_file(OutputKind::ObjectFile, path);
+            let buffer = ModuleBuffer::new(PathBuf::from(path)); // The buffer only records where the object file was written.
+            let llmod_id = CString::new(&module.name[..]).unwrap();
+            serialized_modules.push((SerializedModule::Local(buffer), llmod_id));
+        }
+        // Sort the modules by name to ensure we produce deterministic results.
+        serialized_modules.sort_by(|module1, module2| module1.1.cmp(&module2.1));
+
+        // We add the object files and save in should_combine_object_files that we should combine
+        // them into a single object file when compiling later.
+        for (bc_decoded, name) in serialized_modules {
+            let _timer = cgcx
+                .prof
+                .generic_activity_with_arg_recorder("GCC_fat_lto_link_module", |recorder| {
+                    recorder.record_arg(format!("{:?}", name))
+                });
+            info!("linking {:?}", name);
+            match bc_decoded {
+                SerializedModule::Local(ref module_buffer) => {
+                    module.module_llvm.should_combine_object_files = true;
+                    module.module_llvm.context.add_driver_option(module_buffer.0.to_str().expect("path")); // Pass the object file's path to the driver of the base context.
+                },
+                SerializedModule::FromRlib(_) => unimplemented!("from rlib"),
+                SerializedModule::FromUncompressedFile(_) => unimplemented!("from uncompressed file"),
+            }
+            serialized_bitcode.push(bc_decoded); // Collected so the buffers are returned in `_serialized_bitcode`.
+        }
+        save_temp_bitcode(cgcx, &module, "lto.input");
+
+        // Internalize everything below threshold (the pass itself is commented out; only the save below runs).
+        /*unsafe {
+            let ptr = symbols_below_threshold.as_ptr();
+            llvm::LLVMRustRunRestrictionPass(
+                llmod,
+                ptr as *const *const libc::c_char,
+                symbols_below_threshold.len() as libc::size_t,
+            );*/
+            save_temp_bitcode(cgcx, &module, "lto.after-restriction");
+        //}
+    }
+
+    // NOTE: save the temporary directory used by LTO so that it gets deleted after linking instead
+    // of now.
+    module.module_llvm.temp_dir = Some(tmp_path);
+
+    Ok(LtoModuleCodegen::Fat { module, _serialized_bitcode: serialized_bitcode })
+}
+
+pub struct ModuleBuffer(PathBuf); // Holds the path of a module's file on disk instead of the module's bytes.
+
+impl ModuleBuffer {
+    pub fn new(path: PathBuf) -> ModuleBuffer { // Wraps `path` without touching the file system.
+        ModuleBuffer(path)
+    }
+}
+
+impl ModuleBufferMethods for ModuleBuffer {
+    fn data(&self) -> &[u8] { // Always panics: this buffer never exposes the file contents as bytes.
+        unimplemented!("data not needed for GCC codegen");
+    }
+}
diff --git a/src/back/mod.rs b/src/back/mod.rs
index d692799d764..10187eab0d7 100644
--- a/src/back/mod.rs
+++ b/src/back/mod.rs
@@ -1 +1,2 @@
+pub mod lto;
 pub mod write;
diff --git a/src/back/write.rs b/src/back/write.rs
index 5f54ac4ebc6..04772d7707a 100644
--- a/src/back/write.rs
+++ b/src/back/write.rs
@@ -2,27 +2,71 @@ use std::{env, fs};
 
 use gccjit::OutputKind;
 use rustc_codegen_ssa::{CompiledModule, ModuleCodegen};
-use rustc_codegen_ssa::back::write::{CodegenContext, EmitObj, ModuleConfig};
+use rustc_codegen_ssa::back::link::ensure_removed;
+use rustc_codegen_ssa::back::write::{BitcodeSection, CodegenContext, EmitObj, ModuleConfig};
 use rustc_errors::Handler;
+use rustc_fs_util::link_or_copy;
 use rustc_session::config::OutputType;
 use rustc_span::fatal_error::FatalError;
 use rustc_target::spec::SplitDebuginfo;
 
 use crate::{GccCodegenBackend, GccContext};
+use crate::errors::CopyBitcode;
 
-pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_handler: &Handler, module: ModuleCodegen<GccContext>, config: &ModuleConfig) -> Result<CompiledModule, FatalError> {
-    let _timer = cgcx.prof.generic_activity_with_arg("LLVM_module_codegen", &*module.name);
+pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, diag_handler: &Handler, module: ModuleCodegen<GccContext>, config: &ModuleConfig) -> Result<CompiledModule, FatalError> {
+    let _timer = cgcx.prof.generic_activity_with_arg("GCC_module_codegen", &*module.name);
     {
         let context = &module.module_llvm.context;
 
         let module_name = module.name.clone();
+
+        let should_combine_object_files = module.module_llvm.should_combine_object_files;
+
         let module_name = Some(&module_name[..]);
 
-        let _bc_out = cgcx.output_filenames.temp_path(OutputType::Bitcode, module_name);
+        // NOTE: Only generate object files with GIMPLE when this environment variable is set for
+        // now because this requires a particular setup (same gcc/lto1/lto-wrapper commit as libgccjit).
+        let fat_lto = env::var("EMBED_LTO_BITCODE").as_deref() == Ok("1");
+
+        let bc_out = cgcx.output_filenames.temp_path(OutputType::Bitcode, module_name);
         let obj_out = cgcx.output_filenames.temp_path(OutputType::Object, module_name);
 
-        if config.bitcode_needed() {
+        if config.bitcode_needed() && fat_lto {
+            let _timer = cgcx
+                .prof
+                .generic_activity_with_arg("GCC_module_codegen_make_bitcode", &*module.name);
+
             // TODO(antoyo)
+            /*if let Some(bitcode_filename) = bc_out.file_name() {
+                cgcx.prof.artifact_size(
+                    "llvm_bitcode",
+                    bitcode_filename.to_string_lossy(),
+                    data.len() as u64,
+                );
+            }*/
+
+            if config.emit_bc || config.emit_obj == EmitObj::Bitcode {
+                let _timer = cgcx
+                    .prof
+                    .generic_activity_with_arg("GCC_module_codegen_emit_bitcode", &*module.name);
+                context.add_command_line_option("-flto=auto");
+                context.add_command_line_option("-flto-partition=one");
+                context.compile_to_file(OutputKind::ObjectFile, bc_out.to_str().expect("path to str"));
+            }
+
+            if config.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full) {
+                let _timer = cgcx
+                    .prof
+                    .generic_activity_with_arg("GCC_module_codegen_embed_bitcode", &*module.name);
+                // TODO(antoyo): maybe we should call embed_bitcode to have the proper iOS fixes?
+                //embed_bitcode(cgcx, llcx, llmod, &config.bc_cmdline, data);
+
+                context.add_command_line_option("-flto=auto");
+                context.add_command_line_option("-flto-partition=one");
+                context.add_command_line_option("-ffat-lto-objects");
+                // TODO(antoyo): Send -plugin/usr/lib/gcc/x86_64-pc-linux-gnu/11.1.0/liblto_plugin.so to linker (this should be done when specifying the appropriate rustc cli argument).
+                context.compile_to_file(OutputKind::ObjectFile, bc_out.to_str().expect("path to str"));
+            }
         }
 
         if config.emit_ir {
@@ -32,7 +76,7 @@ pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_han
         if config.emit_asm {
             let _timer = cgcx
                 .prof
-                .generic_activity_with_arg("LLVM_module_codegen_emit_asm", &*module.name);
+                .generic_activity_with_arg("GCC_module_codegen_emit_asm", &*module.name);
             let path = cgcx.output_filenames.temp_path(OutputType::Assembly, module_name);
             context.compile_to_file(OutputKind::Assembler, path.to_str().expect("path to str"));
         }
@@ -41,7 +85,7 @@ pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_han
             EmitObj::ObjectCode(_) => {
                 let _timer = cgcx
                     .prof
-                    .generic_activity_with_arg("LLVM_module_codegen_emit_obj", &*module.name);
+                    .generic_activity_with_arg("GCC_module_codegen_emit_obj", &*module.name);
                 if env::var("CG_GCCJIT_DUMP_MODULE_NAMES").as_deref() == Ok("1") {
                     println!("Module {}", module.name);
                 }
@@ -60,11 +104,36 @@ pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_han
                     context.set_debug_info(true);
                     context.dump_to_file(path, true);
                 }
-                context.compile_to_file(OutputKind::ObjectFile, obj_out.to_str().expect("path to str"));
+                if should_combine_object_files && fat_lto {
+                    context.add_command_line_option("-flto=auto");
+                    context.add_command_line_option("-flto-partition=one");
+
+                    context.add_driver_option("-Wl,-r");
+                    // NOTE: we need -nostdlib, otherwise, we get the following error:
+                    // /usr/bin/ld: cannot find -lgcc_s: No such file or directory
+                    context.add_driver_option("-nostdlib");
+                    // NOTE: without -fuse-linker-plugin, we get the following error:
+                    // lto1: internal compiler error: decompressed stream: Destination buffer is too small
+                    context.add_driver_option("-fuse-linker-plugin");
+
+                    // NOTE: this doesn't actually generate an executable. With the above flags, it combines the .o files together in another .o.
+                    context.compile_to_file(OutputKind::Executable, obj_out.to_str().expect("path to str"));
+                }
+                else {
+                    context.compile_to_file(OutputKind::ObjectFile, obj_out.to_str().expect("path to str"));
+                }
             }
 
             EmitObj::Bitcode => {
-                // TODO(antoyo)
+                debug!("copying bitcode {:?} to obj {:?}", bc_out, obj_out);
+                if let Err(err) = link_or_copy(&bc_out, &obj_out) {
+                    diag_handler.emit_err(CopyBitcode { err });
+                }
+
+                if !config.emit_bc {
+                    debug!("removing_bitcode {:?}", bc_out);
+                    ensure_removed(diag_handler, &bc_out);
+                }
             }
 
             EmitObj::None => {}
@@ -82,3 +151,18 @@ pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_han
 pub(crate) fn link(_cgcx: &CodegenContext<GccCodegenBackend>, _diag_handler: &Handler, mut _modules: Vec<ModuleCodegen<GccContext>>) -> Result<ModuleCodegen<GccContext>, FatalError> {
     unimplemented!();
 }
+
+pub(crate) fn save_temp_bitcode(cgcx: &CodegenContext<GccCodegenBackend>, _module: &ModuleCodegen<GccContext>, _name: &str) { // No-op unless `cgcx.save_temps` is set.
+    if !cgcx.save_temps { // Nothing to do when temporaries were not requested.
+        return;
+    }
+    unimplemented!(); // With `save_temps` set this panics; the block below is commented out and references `llvm::` items.
+    /*unsafe {
+        let ext = format!("{}.bc", name);
+        let cgu = Some(&module.name[..]);
+        let path = cgcx.output_filenames.temp_path_ext(&ext, cgu);
+        let cstr = path_to_c_string(&path);
+        let llmod = module.module_llvm.llmod();
+        llvm::LLVMWriteBitcodeToFile(llmod, cstr.as_ptr());
+    }*/
+}
diff --git a/src/base.rs b/src/base.rs
index bf0309fea14..266d60da10c 100644
--- a/src/base.rs
+++ b/src/base.rs
@@ -56,6 +56,7 @@ pub fn global_linkage_to_gcc(linkage: Linkage) -> GlobalKind {
 pub fn linkage_to_gcc(linkage: Linkage) -> FunctionType {
     match linkage {
         Linkage::External => FunctionType::Exported,
+        // TODO(antoyo): set the attribute externally_visible.
         Linkage::AvailableExternally => FunctionType::Extern,
         Linkage::LinkOnceAny => unimplemented!(),
         Linkage::LinkOnceODR => unimplemented!(),
@@ -91,7 +92,6 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, target_info: Arc<
     fn module_codegen(tcx: TyCtxt<'_>, (cgu_name, target_info): (Symbol, Arc<TargetInfo>)) -> ModuleCodegen<GccContext> {
         let cgu = tcx.codegen_unit(cgu_name);
         // Instantiate monomorphizations without filling out definitions yet...
-        //let llvm_module = ModuleLlvm::new(tcx, &cgu_name.as_str());
         let context = Context::default();
 
         context.add_command_line_option("-fexceptions");
@@ -152,7 +152,10 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, target_info: Arc<
             context.add_command_line_option("-fdump-rtl-all");
         }
         if env::var("CG_GCCJIT_DUMP_TREE_ALL").as_deref() == Ok("1") {
-            context.add_command_line_option("-fdump-tree-all");
+            context.add_command_line_option("-fdump-tree-all-eh");
+        }
+        if env::var("CG_GCCJIT_DUMP_IPA_ALL").as_deref() == Ok("1") {
+            context.add_command_line_option("-fdump-ipa-all-eh");
         }
         if env::var("CG_GCCJIT_DUMP_CODE").as_deref() == Ok("1") {
             context.set_dump_code_on_compile(true);
@@ -168,6 +171,10 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, target_info: Arc<
             context.set_keep_intermediates(true);
         }
 
+        if env::var("CG_GCCJIT_VERBOSE").as_deref() == Ok("1") {
+            context.add_driver_option("-v");
+        }
+
         // NOTE: The codegen generates unrechable blocks.
         context.set_allow_unreachable_blocks(true);
 
@@ -197,7 +204,9 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, target_info: Arc<
         ModuleCodegen {
             name: cgu_name.to_string(),
             module_llvm: GccContext {
-                context
+                context,
+                should_combine_object_files: false,
+                temp_dir: None,
             },
             kind: ModuleKind::Regular,
         }
diff --git a/src/declare.rs b/src/declare.rs
index 493626c3cf5..e673d0af4c7 100644
--- a/src/declare.rs
+++ b/src/declare.rs
@@ -1,4 +1,6 @@
 use gccjit::{Function, FunctionType, GlobalKind, LValue, RValue, Type};
+#[cfg(feature="master")]
+use gccjit::{FnAttribute, ToRValue};
 use rustc_codegen_ssa::traits::BaseTypeMethods;
 use rustc_middle::ty::Ty;
 use rustc_span::Symbol;
@@ -114,6 +116,44 @@ fn declare_raw_fn<'gcc>(cx: &CodegenCx<'gcc, '_>, name: &str, _callconv: () /*ll
                 .collect();
             let func = cx.context.new_function(None, cx.linkage.get(), return_type, &params, mangle_name(name), variadic);
             cx.functions.borrow_mut().insert(name.to_string(), func);
+
+            #[cfg(feature="master")]
+            if name == "rust_eh_personality" {
+                // NOTE: GCC will sometimes change the personality function set on a function from
+                // rust_eh_personality to __gcc_personality_v0 as an optimization.
+                // As such, we need to create a weak alias from __gcc_personality_v0 to
+                // rust_eh_personality in order to avoid a linker error.
+                // This needs to be weak in order to still allow using the standard
+                // __gcc_personality_v0 when linking to it.
+                // Since aliases don't work (maybe because of a bug in LTO partitioning?), we
+                // create a wrapper function that calls rust_eh_personality.
+
+                let params: Vec<_> = param_types.into_iter().enumerate()
+                    .map(|(index, param)| cx.context.new_parameter(None, *param, &format!("param{}", index))) // TODO(antoyo): set name.
+                    .collect();
+                let gcc_func = cx.context.new_function(None, FunctionType::Exported, return_type, &params, "__gcc_personality_v0", variadic);
+
+                // We need a normal extern function for the crates that access rust_eh_personality
+                // without defining it, otherwise we'll get a compiler error.
+                //
+                // For the crate defining it, that needs to be a weak alias instead.
+                gcc_func.add_attribute(FnAttribute::Weak);
+
+                let block = gcc_func.new_block("start");
+                let mut args = vec![];
+                for param in &params {
+                    args.push(param.to_rvalue());
+                }
+                let call = cx.context.new_call(None, func, &args);
+                if return_type == cx.type_void() {
+                    block.add_eval(None, call);
+                    block.end_with_void_return(None);
+                }
+                else {
+                    block.end_with_return(None, call);
+                }
+            }
+
             func
         };
 
diff --git a/src/errors.rs b/src/errors.rs
index 693367192b1..19a967cb489 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -40,3 +40,34 @@ pub(crate) struct TiedTargetFeatures {
     pub span: Span,
     pub features: String,
 }
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_copy_bitcode)]
+pub(crate) struct CopyBitcode { // Emitted when copying the bitcode file to the object file path fails.
+    pub err: std::io::Error, // The I/O error returned by the failed copy.
+}
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_dynamic_linking_with_lto)]
+#[note]
+pub(crate) struct DynamicLinkingWithLTO; // Field-less diagnostic that also carries a note.
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_load_bitcode)]
+pub(crate) struct LoadBitcode {
+    name: String, // NOTE(review): not `pub`, unlike `err` and `gcc_err` in this file; confirm this is intended.
+}
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_lto_disallowed)]
+pub(crate) struct LtoDisallowed; // Field-less diagnostic.
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_lto_dylib)]
+pub(crate) struct LtoDylib; // Field-less diagnostic.
+
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_lto_bitcode_from_rlib)]
+pub(crate) struct LtoBitcodeFromRlib { // Built when writing an rlib member into the LTO temp dir fails.
+    pub gcc_err: String, // Description of the failure, including the underlying error text.
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7b55a4e4082..2de8fb3fc70 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,12 @@
  * TODO(antoyo): implement equality in libgccjit based on https://zpz.github.io/blog/overloading-equality-operator-in-cpp-class-hierarchy/ (for type equality?)
  * TODO(antoyo): support #[inline] attributes.
  * TODO(antoyo): support LTO (gcc's equivalent to Full LTO is -flto -flto-partition=one — https://documentation.suse.com/sbp/all/html/SBP-GCC-10/index.html).
+ * For Thin LTO, this might be helpful:
+ * In gcc 4.6 -fwhopr was removed and became default with -flto. The non-whopr path can still be executed via -flto-partition=none.
+ *
+ * Maybe some missing optimizations enabled by rustc's LTO are in there: https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
+ * Like -fipa-icf (should be already enabled) and maybe -fdevirtualize-at-ltrans.
+ * TODO: disable debug info always being emitted. Perhaps this slows down things?
  *
  * TODO(antoyo): remove the patches.
  */
@@ -28,6 +34,7 @@ extern crate rustc_codegen_ssa;
 extern crate rustc_data_structures;
 extern crate rustc_errors;
 extern crate rustc_fluent_macro;
+extern crate rustc_fs_util;
 extern crate rustc_hir;
 extern crate rustc_macros;
 extern crate rustc_metadata;
@@ -35,6 +42,8 @@ extern crate rustc_middle;
 extern crate rustc_session;
 extern crate rustc_span;
 extern crate rustc_target;
+#[macro_use]
+extern crate tracing;
 
 // This prevents duplicating functions and statics that are already part of the host rustc process.
 #[allow(unused_extern_crates)]
@@ -65,22 +74,24 @@ mod type_of;
 use std::any::Any;
 use std::sync::Arc;
 #[cfg(not(feature="master"))]
-use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::atomic::AtomicBool;
+#[cfg(not(feature="master"))]
+use std::sync::atomic::Ordering;
 
-use crate::errors::LTONotSupported;
 use gccjit::{Context, OptimizationLevel};
 #[cfg(feature="master")]
 use gccjit::TargetInfo;
 #[cfg(not(feature="master"))]
 use gccjit::CType;
+use errors::LTONotSupported;
 use rustc_ast::expand::allocator::AllocatorKind;
 use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen};
 use rustc_codegen_ssa::base::codegen_crate;
 use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryFn};
 use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
 use rustc_codegen_ssa::target_features::supported_target_features;
-use rustc_codegen_ssa::traits::{CodegenBackend, ExtraBackendMethods, ModuleBufferMethods, ThinBufferMethods, WriteBackendMethods};
 use rustc_data_structures::fx::FxIndexMap;
+use rustc_codegen_ssa::traits::{CodegenBackend, ExtraBackendMethods, ThinBufferMethods, WriteBackendMethods};
 use rustc_errors::{DiagnosticMessage, ErrorGuaranteed, Handler, SubdiagnosticMessage};
 use rustc_fluent_macro::fluent_messages;
 use rustc_metadata::EncodedMetadata;
@@ -91,9 +102,10 @@ use rustc_session::config::{Lto, OptLevel, OutputFilenames};
 use rustc_session::Session;
 use rustc_span::Symbol;
 use rustc_span::fatal_error::FatalError;
-#[cfg(not(feature="master"))]
 use tempfile::TempDir;
 
+use crate::back::lto::ModuleBuffer;
+
 fluent_messages! { "../messages.ftl" }
 
 pub struct PrintOnPanic<F: Fn() -> String>(pub F);
@@ -136,7 +148,7 @@ impl CodegenBackend for GccCodegenBackend {
     fn init(&self, sess: &Session) {
         #[cfg(feature="master")]
         gccjit::set_global_personality_function_name(b"rust_eh_personality\0");
-        if sess.lto() != Lto::No {
+        if sess.lto() == Lto::Thin {
             sess.emit_warning(LTONotSupported {});
         }
 
@@ -194,7 +206,12 @@ impl ExtraBackendMethods for GccCodegenBackend {
     fn codegen_allocator<'tcx>(&self, tcx: TyCtxt<'tcx>, module_name: &str, kind: AllocatorKind, alloc_error_handler_kind: AllocatorKind) -> Self::Module {
         let mut mods = GccContext {
             context: Context::default(),
+            should_combine_object_files: false,
+            temp_dir: None,
         };
+
+        // TODO(antoyo): only set for x86.
+        mods.context.add_command_line_option("-masm=intel");
         unsafe { allocator::codegen(tcx, &mut mods, module_name, kind, alloc_error_handler_kind); }
         mods
     }
@@ -211,14 +228,6 @@ impl ExtraBackendMethods for GccCodegenBackend {
     }
 }
 
-pub struct ModuleBuffer;
-
-impl ModuleBufferMethods for ModuleBuffer {
-    fn data(&self) -> &[u8] {
-        unimplemented!();
-    }
-}
-
 pub struct ThinBuffer;
 
 impl ThinBufferMethods for ThinBuffer {
@@ -229,6 +238,9 @@ impl ThinBufferMethods for ThinBuffer {
 
 pub struct GccContext {
     context: Context<'static>,
+    should_combine_object_files: bool,
+    // Temporary directory used by LTO. We keep it here so that it's not removed before linking.
+    temp_dir: Option<TempDir>,
 }
 
 unsafe impl Send for GccContext {}
@@ -243,18 +255,8 @@ impl WriteBackendMethods for GccCodegenBackend {
     type ThinData = ();
     type ThinBuffer = ThinBuffer;
 
-    fn run_fat_lto(_cgcx: &CodegenContext<Self>, mut modules: Vec<FatLtoInput<Self>>, _cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>) -> Result<LtoModuleCodegen<Self>, FatalError> {
-        // TODO(antoyo): implement LTO by sending -flto to libgccjit and adding the appropriate gcc linker plugins.
-        // NOTE: implemented elsewhere.
-        // TODO(antoyo): what is implemented elsewhere ^ ?
-        let module =
-            match modules.remove(0) {
-                FatLtoInput::InMemory(module) => module,
-                FatLtoInput::Serialized { .. } => {
-                    unimplemented!();
-                }
-            };
-        Ok(LtoModuleCodegen::Fat { module, _serialized_bitcode: vec![] })
+    fn run_fat_lto(cgcx: &CodegenContext<Self>, modules: Vec<FatLtoInput<Self>>, cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>) -> Result<LtoModuleCodegen<Self>, FatalError> {
+        back::lto::run_fat(cgcx, modules, cached_modules)
     }
 
     fn run_thin_lto(_cgcx: &CodegenContext<Self>, _modules: Vec<(String, Self::ThinBuffer)>, _cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>) -> Result<(Vec<LtoModuleCodegen<Self>>, Vec<WorkProduct>), FatalError> {
diff --git a/test.sh b/test.sh
index 1054fdf7ea1..c47cf140ae4 100755
--- a/test.sh
+++ b/test.sh
@@ -3,6 +3,7 @@
 # TODO(antoyo): rewrite to cargo-make (or just) or something like that to only rebuild the sysroot when needed?
 
 set -e
+#set -x
 
 if [ -f ./gcc_path ]; then
     export GCC_PATH=$(cat gcc_path)
@@ -345,14 +346,13 @@ function test_rustc() {
 
     git checkout -- tests/ui/issues/auxiliary/issue-3136-a.rs # contains //~ERROR, but shouldn't be removed
 
-    rm -r tests/ui/{abi*,extern/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,borrowck/,chalkify/bugs/,test*,*lto*.rs,consts/const-float-bits-reject-conv.rs,consts/issue-miri-1910.rs} || true
+    rm -r tests/ui/{abi*,extern/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,borrowck/,chalkify/bugs/,test*,consts/const-float-bits-reject-conv.rs,consts/issue-miri-1910.rs} || true
     rm tests/ui/mir/mir_heavy_promoted.rs # this test is oom-killed in the CI.
     # Tests generating errors.
     rm tests/ui/consts/const-eval/nonnull_as_ref_ub.rs tests/ui/consts/issue-94675.rs
-    for test in $(rg --files-with-matches "thread|lto" tests/ui); do
+    for test in $(rg --files-with-matches "thread" tests/ui); do
       rm $test
     done
-    git checkout tests/ui/lto/auxiliary/dylib.rs
     git checkout tests/ui/type-alias-impl-trait/auxiliary/cross_crate_ice.rs
     git checkout tests/ui/type-alias-impl-trait/auxiliary/cross_crate_ice2.rs
     git checkout tests/ui/macros/rfc-2011-nicer-assert-messages/auxiliary/common.rs