about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorAlex Crichton <alex@alexcrichton.com>2016-12-29 17:26:15 -0800
committerGitHub <noreply@github.com>2016-12-29 17:26:15 -0800
commitbcfd50474450cdf6b858d4f033f201264370a730 (patch)
tree3f03699f7e6470eda62cb0e8560ab15bbc14690c /src
parent4e2e01ec9570997b9a0588f691d85dd081f83fd9 (diff)
parentaac5ff76649a2257e2c04f1d44cf11e999a39442 (diff)
downloadrust-bcfd50474450cdf6b858d4f033f201264370a730.tar.gz
rust-bcfd50474450cdf6b858d4f033f201264370a730.zip
Rollup merge of #38559 - japaric:ptx2, r=alexcrichton
PTX support, take 2

- You can generate PTX using `--emit=asm` and the right (custom) target. Which
  then you can run on a NVIDIA GPU.

- You can compile `core` to PTX. [Xargo] also works and it can compile some
  other crates like `collections` (but I doubt all of those make sense on a GPU)

[Xargo]: https://github.com/japaric/xargo

- You can create "global" functions, which can be "called" by the host, using
  the `"ptx-kernel"` ABI, e.g. `extern "ptx-kernel" fn kernel() { .. }`. Every
  other function is a "device" function and can only be called by the GPU.

- Intrinsics like `__syncthreads()` and `blockIdx.x` are available as
  `"platform-intrinsics"`. These intrinsics are *not* in the `core` crate but
  any Rust user can create "bindings" to them using an `extern
  "platform-intrinsics"` block. See example at the end.

- Trying to emit PTX with `-g` (debuginfo); you get an LLVM error. But I don't
  think PTX can contain debuginfo anyway so `-g` should be ignored and a warning
  should be printed ("`-g` doesn't work with this target" or something).

- "Single source" support. You *can't* write a single source file that contains
  both host and device code. I think that should be possible to implement that
  outside the compiler using compiler plugins / build scripts.

- The equivalent to CUDA `__shared__` which it's used to declare memory that's
  shared between the threads of the same block. This could be implemented using
  attributes: `#[shared] static mut SCRATCH_MEMORY: [f32; 64]` but hasn't been
  implemented yet.

- Built-in targets. This PR doesn't add targets to the compiler just yet but one
  can create custom targets to be able to emit PTX code (see the example at the
  end). The idea is to have people experiment with this feature before
  committing to it (built-in targets are "insta-stable")

- All functions must be "inlined". IOW, the `.rlib` must always contain the LLVM
  bitcode of all the functions of the crate it was produced from. Otherwise, you
  end with "undefined references" in the final PTX code but you won't get *any*
  linker error because no linker is involved. IOW, you'll hit a runtime error
  when loading the PTX into the GPU. The workaround is to use `#[inline]` on
  non-generic functions and to never use `#[inline(never)]` but this may not
  always be possible because e.g. you could be relying on third party code.

- Should `--emit=asm` generate a `.ptx` file instead of a `.s` file?

TL;DR Use Xargo to turn a crate into a PTX module (a `.s` file). Then pass that
PTX module, as a string, to the GPU and run it.

The full code is in [this repository]. This section gives an overview of how to
run Rust code on a NVIDIA GPU.

[this repository]: https://github.com/japaric/cuda

- Create a custom target. Here's the 64-bit NVPTX target (NOTE: the comments
  are not valid because this is supposed to be a JSON file; remove them before
  you use this file):

``` js
// nvptx64-nvidia-cuda.json
{
  "arch": "nvptx64",  // matches LLVM
  "cpu": "sm_20",  // "oldest" compute capability supported by LLVM
  "data-layout": "e-i64:64-v16:16-v32:32-n16:32:64",
  "llvm-target": "nvptx64-nvidia-cuda",
  "max-atomic-width": 0,  // LLVM errors with any other value :-(
  "os": "cuda",  // matches LLVM
  "panic-strategy": "abort",
  "target-endian": "little",
  "target-pointer-width": "64",
  "target-vendor": "nvidia",  // matches LLVM -- not required
}
```

(There's a 32-bit target specification in the linked repository)

- Write a kernel

``` rust

extern "platform-intrinsic" {
    fn nvptx_block_dim_x() -> i32;
    fn nvptx_block_idx_x() -> i32;
    fn nvptx_thread_idx_x() -> i32;
}

/// Copies an array of `n` floating point numbers from `src` to `dst`
pub unsafe extern "ptx-kernel" fn memcpy(dst: *mut f32,
                                         src: *const f32,
                                         n: usize) {
    let i = (nvptx_block_dim_x() as isize)
        .wrapping_mul(nvptx_block_idx_x() as isize)
        .wrapping_add(nvptx_thread_idx_x() as isize);

    if (i as usize) < n {
        *dst.offset(i) = *src.offset(i);
    }
}
```

- Emit PTX code

```
$ xargo rustc --target nvptx64-nvidia-cuda --release -- --emit=asm
   Compiling core v0.0.0 (file://..)
   (..)
   Compiling nvptx-builtins v0.1.0 (https://github.com/japaric/nvptx-builtins)
   Compiling kernel v0.1.0

$ cat target/nvptx64-nvidia-cuda/release/deps/kernel-*.s
//
// Generated by LLVM NVPTX Back-End
//

.version 3.2
.target sm_20
.address_size 64

        // .globl       memcpy

.visible .entry memcpy(
        .param .u64 memcpy_param_0,
        .param .u64 memcpy_param_1,
        .param .u64 memcpy_param_2
)
{
        .reg .pred      %p<2>;
        .reg .s32       %r<5>;
        .reg .s64       %rd<12>;

        ld.param.u64    %rd7, [memcpy_param_2];
        mov.u32 %r1, %ntid.x;
        mov.u32 %r2, %ctaid.x;
        mul.wide.s32    %rd8, %r2, %r1;
        mov.u32 %r3, %tid.x;
        cvt.s64.s32     %rd9, %r3;
        add.s64         %rd10, %rd9, %rd8;
        setp.ge.u64     %p1, %rd10, %rd7;
        @%p1 bra        LBB0_2;
        ld.param.u64    %rd3, [memcpy_param_0];
        ld.param.u64    %rd4, [memcpy_param_1];
        cvta.to.global.u64      %rd5, %rd4;
        cvta.to.global.u64      %rd6, %rd3;
        shl.b64         %rd11, %rd10, 2;
        add.s64         %rd1, %rd6, %rd11;
        add.s64         %rd2, %rd5, %rd11;
        ld.global.u32   %r4, [%rd2];
        st.global.u32   [%rd1], %r4;
LBB0_2:
        ret;
}
```

- Run it on the GPU

``` rust
// `kernel.ptx` is the `*.s` file we got in the previous step
const KERNEL: &'static str = include_str!("kernel.ptx");

driver::initialize()?;

let device = Device(0)?;
let ctx = device.create_context()?;
let module = ctx.load_module(KERNEL)?;
let kernel = module.function("memcpy")?;

let h_a: Vec<f32> = /* create some random data */;
let h_b = vec![0.; N];

let d_a = driver::allocate(bytes)?;
let d_b = driver::allocate(bytes)?;

// Copy from host to GPU
driver::copy(h_a, d_a)?;

// Run `memcpy` on the GPU
kernel.launch(d_b, d_a, N)?;

// Copy from GPU to host
driver::copy(d_b, h_b)?;

// Verify
assert_eq!(h_a, h_b);

// `d_a`, `d_b`, `h_a`, `h_b` are dropped/freed here
```

---

cc @alexcrichton @brson @rkruppe

> What has changed since #34195?

- `core` now can be compiled into PTX. Which makes it very easy to turn `no_std`
  crates into "kernels" with the help of Xargo.

- There's now a way, the `"ptx-kernel"` ABI, to generate "global" functions. The
  old PR required a manual step (it was hack) to "convert" "device" functions
  into "global" functions. (Only "global" functions can be launched by the host)

- Everything is unstable. There are not "insta stable" built-in targets this
  time (\*). The users have to use a custom target to experiment with this
  feature. Also, PTX instrinsics, like `__syncthreads` and `blockIdx.x`, are now
  implemented as `"platform-intrinsics"` so they no longer live in the `core`
  crate.

(\*) I'd actually like to have in-tree targets because that makes this target
more discoverable, removes the need to lug around .json files, etc.

However, bundling a target with the compiler immediately puts it in the path
towards stabilization. Which gives us just two cycles to find and fix any
problem with the target specification. Afterwards, it becomes hard to tweak
the specification because that could be a breaking change.

A possible solution could be "unstable built-in targets". Basically, to use an
unstable target, you'll have to also pass `-Z unstable-options` to the compiler.
And unstable targets, being unstable, wouldn't be available on stable.

> Why should this be merged?

- To let people experiment with the feature out of tree. Having easy access to
  the feature (in every nightly) allows this. I also think that, as it is, it
  should be possible to start prototyping type-safe single source support using
  build scripts, macros and/or plugins.

- It's a straightforward implementation. No different that adding support for
  any other architecture.
Diffstat (limited to 'src')
-rw-r--r--src/bootstrap/native.rs2
-rw-r--r--src/etc/platform-intrinsics/nvptx/cuda.json13
-rw-r--r--src/etc/platform-intrinsics/nvptx/info.json7
-rw-r--r--src/etc/platform-intrinsics/nvptx/sreg.json90
-rw-r--r--src/librustc_llvm/build.rs2
-rw-r--r--src/librustc_llvm/ffi.rs1
-rw-r--r--src/librustc_llvm/lib.rs5
-rw-r--r--src/librustc_platform_intrinsics/lib.rs3
-rw-r--r--src/librustc_platform_intrinsics/nvptx.rs92
-rw-r--r--src/librustc_trans/abi.rs5
-rw-r--r--src/librustc_trans/cabi_nvptx.rs53
-rw-r--r--src/librustc_trans/cabi_nvptx64.rs53
-rw-r--r--src/librustc_trans/lib.rs2
-rw-r--r--src/libsyntax/abi.rs2
-rw-r--r--src/libsyntax/feature_gate.rs17
-rw-r--r--src/test/ui/codemap_tests/unicode.stderr2
16 files changed, 345 insertions, 4 deletions
diff --git a/src/bootstrap/native.rs b/src/bootstrap/native.rs
index 09dbd9f8220..80f27a5ab67 100644
--- a/src/bootstrap/native.rs
+++ b/src/bootstrap/native.rs
@@ -81,7 +81,7 @@ pub fn llvm(build: &Build, target: &str) {
        .profile(profile)
        .define("LLVM_ENABLE_ASSERTIONS", assertions)
        .define("LLVM_TARGETS_TO_BUILD",
-               "X86;ARM;AArch64;Mips;PowerPC;SystemZ;JSBackend;MSP430;Sparc")
+               "X86;ARM;AArch64;Mips;PowerPC;SystemZ;JSBackend;MSP430;Sparc;NVPTX")
        .define("LLVM_INCLUDE_EXAMPLES", "OFF")
        .define("LLVM_INCLUDE_TESTS", "OFF")
        .define("LLVM_INCLUDE_DOCS", "OFF")
diff --git a/src/etc/platform-intrinsics/nvptx/cuda.json b/src/etc/platform-intrinsics/nvptx/cuda.json
new file mode 100644
index 00000000000..1beaaeb5d87
--- /dev/null
+++ b/src/etc/platform-intrinsics/nvptx/cuda.json
@@ -0,0 +1,13 @@
+{
+    "intrinsic_prefix": "_",
+    "llvm_prefix": "llvm.cuda.",
+    "intrinsics": [
+        {
+            "intrinsic": "syncthreads",
+            "width": ["0"],
+            "llvm": "syncthreads",
+            "ret": "V",
+            "args": []
+        }
+    ]
+}
diff --git a/src/etc/platform-intrinsics/nvptx/info.json b/src/etc/platform-intrinsics/nvptx/info.json
new file mode 100644
index 00000000000..80332c54e04
--- /dev/null
+++ b/src/etc/platform-intrinsics/nvptx/info.json
@@ -0,0 +1,7 @@
+{
+  "platform": "nvptx",
+  "number_info": {
+    "signed": {}
+  },
+  "width_info": {}
+}
diff --git a/src/etc/platform-intrinsics/nvptx/sreg.json b/src/etc/platform-intrinsics/nvptx/sreg.json
new file mode 100644
index 00000000000..33d97f26946
--- /dev/null
+++ b/src/etc/platform-intrinsics/nvptx/sreg.json
@@ -0,0 +1,90 @@
+{
+    "intrinsic_prefix": "_",
+    "llvm_prefix": "llvm.nvvm.read.ptx.sreg.",
+    "intrinsics": [
+        {
+            "intrinsic": "block_dim_x",
+            "width": ["0"],
+            "llvm": "ntid.x",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "block_dim_y",
+            "width": ["0"],
+            "llvm": "ntid.y",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "block_dim_z",
+            "width": ["0"],
+            "llvm": "ntid.z",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "block_idx_x",
+            "width": ["0"],
+            "llvm": "ctaid.x",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "block_idx_y",
+            "width": ["0"],
+            "llvm": "ctaid.y",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "block_idx_z",
+            "width": ["0"],
+            "llvm": "ctaid.z",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "grid_dim_x",
+            "width": ["0"],
+            "llvm": "nctaid.x",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "grid_dim_y",
+            "width": ["0"],
+            "llvm": "nctaid.y",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "grid_dim_z",
+            "width": ["0"],
+            "llvm": "nctaid.z",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "thread_idx_x",
+            "width": ["0"],
+            "llvm": "tid.x",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "thread_idx_y",
+            "width": ["0"],
+            "llvm": "tid.y",
+            "ret": "S32",
+            "args": []
+        },
+        {
+            "intrinsic": "thread_idx_z",
+            "width": ["0"],
+            "llvm": "tid.z",
+            "ret": "S32",
+            "args": []
+        }
+    ]
+}
diff --git a/src/librustc_llvm/build.rs b/src/librustc_llvm/build.rs
index e681a81cf0c..2ee4cc49435 100644
--- a/src/librustc_llvm/build.rs
+++ b/src/librustc_llvm/build.rs
@@ -96,7 +96,7 @@ fn main() {
 
     let optional_components =
         ["x86", "arm", "aarch64", "mips", "powerpc", "pnacl", "systemz", "jsbackend", "msp430",
-         "sparc"];
+         "sparc", "nvptx"];
 
     // FIXME: surely we don't need all these components, right? Stuff like mcjit
     //        or interpreter the compiler itself never uses.
diff --git a/src/librustc_llvm/ffi.rs b/src/librustc_llvm/ffi.rs
index 5fd85023e41..c1705815165 100644
--- a/src/librustc_llvm/ffi.rs
+++ b/src/librustc_llvm/ffi.rs
@@ -42,6 +42,7 @@ pub enum CallConv {
     X86StdcallCallConv = 64,
     X86FastcallCallConv = 65,
     ArmAapcsCallConv = 67,
+    PtxKernel = 71,
     X86_64_SysV = 78,
     X86_64_Win64 = 79,
     X86_VectorCall = 80,
diff --git a/src/librustc_llvm/lib.rs b/src/librustc_llvm/lib.rs
index c8b1ea50f97..1e45ea083c9 100644
--- a/src/librustc_llvm/lib.rs
+++ b/src/librustc_llvm/lib.rs
@@ -376,6 +376,11 @@ pub fn initialize_available_targets() {
                  LLVMInitializeSparcTargetMC,
                  LLVMInitializeSparcAsmPrinter,
                  LLVMInitializeSparcAsmParser);
+    init_target!(llvm_component = "nvptx",
+                 LLVMInitializeNVPTXTargetInfo,
+                 LLVMInitializeNVPTXTarget,
+                 LLVMInitializeNVPTXTargetMC,
+                 LLVMInitializeNVPTXAsmPrinter);
 }
 
 pub fn last_error() -> Option<String> {
diff --git a/src/librustc_platform_intrinsics/lib.rs b/src/librustc_platform_intrinsics/lib.rs
index 6fe1f0c2b9c..e814050e960 100644
--- a/src/librustc_platform_intrinsics/lib.rs
+++ b/src/librustc_platform_intrinsics/lib.rs
@@ -95,6 +95,7 @@ static VOID: Type = Type::Void;
 mod x86;
 mod arm;
 mod aarch64;
+mod nvptx;
 
 impl Intrinsic {
     pub fn find(name: &str) -> Option<Intrinsic> {
@@ -104,6 +105,8 @@ impl Intrinsic {
             arm::find(name)
         } else if name.starts_with("aarch64_") {
             aarch64::find(name)
+        } else if name.starts_with("nvptx_") {
+            nvptx::find(name)
         } else {
             None
         }
diff --git a/src/librustc_platform_intrinsics/nvptx.rs b/src/librustc_platform_intrinsics/nvptx.rs
new file mode 100644
index 00000000000..82408723ebe
--- /dev/null
+++ b/src/librustc_platform_intrinsics/nvptx.rs
@@ -0,0 +1,92 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// DO NOT EDIT: autogenerated by etc/platform-intrinsics/generator.py
+// ignore-tidy-linelength
+
+#![allow(unused_imports)]
+
+use {Intrinsic, Type};
+use IntrinsicDef::Named;
+
+// The default inlining settings trigger a pathological behaviour in
+// LLVM, which causes makes compilation very slow. See #28273.
+#[inline(never)]
+pub fn find(name: &str) -> Option<Intrinsic> {
+    if !name.starts_with("nvptx") { return None }
+    Some(match &name["nvptx".len()..] {
+        "_syncthreads" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::VOID,
+            definition: Named("llvm.cuda.syncthreads")
+        },
+        "_block_dim_x" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ntid.x")
+        },
+        "_block_dim_y" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ntid.y")
+        },
+        "_block_dim_z" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ntid.z")
+        },
+        "_block_idx_x" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ctaid.x")
+        },
+        "_block_idx_y" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ctaid.y")
+        },
+        "_block_idx_z" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.ctaid.z")
+        },
+        "_grid_dim_x" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.nctaid.x")
+        },
+        "_grid_dim_y" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.nctaid.y")
+        },
+        "_grid_dim_z" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.nctaid.z")
+        },
+        "_thread_idx_x" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.tid.x")
+        },
+        "_thread_idx_y" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.tid.y")
+        },
+        "_thread_idx_z" => Intrinsic {
+            inputs: { static INPUTS: [&'static Type; 0] = []; &INPUTS },
+            output: &::I32,
+            definition: Named("llvm.nvvm.read.ptx.sreg.tid.z")
+        },
+        _ => return None,
+    })
+}
diff --git a/src/librustc_trans/abi.rs b/src/librustc_trans/abi.rs
index 9c4246e079b..81e4b4d1f21 100644
--- a/src/librustc_trans/abi.rs
+++ b/src/librustc_trans/abi.rs
@@ -25,6 +25,8 @@ use cabi_mips64;
 use cabi_asmjs;
 use cabi_msp430;
 use cabi_sparc;
+use cabi_nvptx;
+use cabi_nvptx64;
 use machine::{llalign_of_min, llsize_of, llsize_of_alloc};
 use type_::Type;
 use type_of;
@@ -353,6 +355,7 @@ impl FnType {
             Win64 => llvm::X86_64_Win64,
             SysV64 => llvm::X86_64_SysV,
             Aapcs => llvm::ArmAapcsCallConv,
+            PtxKernel => llvm::PtxKernel,
 
             // These API constants ought to be more specific...
             Cdecl => llvm::CCallConv,
@@ -608,6 +611,8 @@ impl FnType {
             "wasm32" => cabi_asmjs::compute_abi_info(ccx, self),
             "msp430" => cabi_msp430::compute_abi_info(ccx, self),
             "sparc" => cabi_sparc::compute_abi_info(ccx, self),
+            "nvptx" => cabi_nvptx::compute_abi_info(ccx, self),
+            "nvptx64" => cabi_nvptx64::compute_abi_info(ccx, self),
             a => ccx.sess().fatal(&format!("unrecognized arch \"{}\" in target specification", a))
         }
 
diff --git a/src/librustc_trans/cabi_nvptx.rs b/src/librustc_trans/cabi_nvptx.rs
new file mode 100644
index 00000000000..5ece19f764a
--- /dev/null
+++ b/src/librustc_trans/cabi_nvptx.rs
@@ -0,0 +1,53 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Reference: PTX Writer's Guide to Interoperability
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability
+
+#![allow(non_upper_case_globals)]
+
+use llvm::Struct;
+
+use abi::{self, ArgType, FnType};
+use context::CrateContext;
+use type_::Type;
+
+fn ty_size(ty: Type) -> usize {
+    abi::ty_size(ty, 4)
+}
+
+fn classify_ret_ty(ccx: &CrateContext, ret: &mut ArgType) {
+    if ret.ty.kind() == Struct && ty_size(ret.ty) > 32 {
+        ret.make_indirect(ccx);
+    } else {
+        ret.extend_integer_width_to(32);
+    }
+}
+
+fn classify_arg_ty(ccx: &CrateContext, arg: &mut ArgType) {
+    if arg.ty.kind() == Struct && ty_size(arg.ty) > 32 {
+        arg.make_indirect(ccx);
+    } else {
+        arg.extend_integer_width_to(32);
+    }
+}
+
+pub fn compute_abi_info(ccx: &CrateContext, fty: &mut FnType) {
+    if !fty.ret.is_ignore() {
+        classify_ret_ty(ccx, &mut fty.ret);
+    }
+
+    for arg in &mut fty.args {
+        if arg.is_ignore() {
+            continue;
+        }
+        classify_arg_ty(ccx, arg);
+    }
+}
diff --git a/src/librustc_trans/cabi_nvptx64.rs b/src/librustc_trans/cabi_nvptx64.rs
new file mode 100644
index 00000000000..880c6cfd7a8
--- /dev/null
+++ b/src/librustc_trans/cabi_nvptx64.rs
@@ -0,0 +1,53 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Reference: PTX Writer's Guide to Interoperability
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability
+
+#![allow(non_upper_case_globals)]
+
+use llvm::Struct;
+
+use abi::{self, ArgType, FnType};
+use context::CrateContext;
+use type_::Type;
+
+fn ty_size(ty: Type) -> usize {
+    abi::ty_size(ty, 8)
+}
+
+fn classify_ret_ty(ccx: &CrateContext, ret: &mut ArgType) {
+    if ret.ty.kind() == Struct && ty_size(ret.ty) > 64 {
+        ret.make_indirect(ccx);
+    } else {
+        ret.extend_integer_width_to(64);
+    }
+}
+
+fn classify_arg_ty(ccx: &CrateContext, arg: &mut ArgType) {
+    if arg.ty.kind() == Struct && ty_size(arg.ty) > 64 {
+        arg.make_indirect(ccx);
+    } else {
+        arg.extend_integer_width_to(64);
+    }
+}
+
+pub fn compute_abi_info(ccx: &CrateContext, fty: &mut FnType) {
+    if !fty.ret.is_ignore() {
+        classify_ret_ty(ccx, &mut fty.ret);
+    }
+
+    for arg in &mut fty.args {
+        if arg.is_ignore() {
+            continue;
+        }
+        classify_arg_ty(ccx, arg);
+    }
+}
diff --git a/src/librustc_trans/lib.rs b/src/librustc_trans/lib.rs
index 3a8eef131a2..0d3e1853f01 100644
--- a/src/librustc_trans/lib.rs
+++ b/src/librustc_trans/lib.rs
@@ -103,6 +103,8 @@ mod cabi_asmjs;
 mod cabi_mips;
 mod cabi_mips64;
 mod cabi_msp430;
+mod cabi_nvptx;
+mod cabi_nvptx64;
 mod cabi_powerpc;
 mod cabi_powerpc64;
 mod cabi_s390x;
diff --git a/src/libsyntax/abi.rs b/src/libsyntax/abi.rs
index a39cac8db99..0cc62fb43a5 100644
--- a/src/libsyntax/abi.rs
+++ b/src/libsyntax/abi.rs
@@ -41,6 +41,7 @@ pub enum Abi {
     Aapcs,
     Win64,
     SysV64,
+    PtxKernel,
 
     // Multiplatform / generic ABIs
     Rust,
@@ -82,6 +83,7 @@ const AbiDatas: &'static [AbiData] = &[
     AbiData {abi: Abi::Aapcs, name: "aapcs", generic: false },
     AbiData {abi: Abi::Win64, name: "win64", generic: false },
     AbiData {abi: Abi::SysV64, name: "sysv64", generic: false },
+    AbiData {abi: Abi::PtxKernel, name: "ptx-kernel", generic: false },
 
     // Cross-platform ABIs
     AbiData {abi: Abi::Rust, name: "Rust", generic: true },
diff --git a/src/libsyntax/feature_gate.rs b/src/libsyntax/feature_gate.rs
index e04cc11f15e..625af803458 100644
--- a/src/libsyntax/feature_gate.rs
+++ b/src/libsyntax/feature_gate.rs
@@ -318,6 +318,9 @@ declare_features! (
 
     // Allow safe suggestions for potential type conversions.
     (active, safe_suggestion, "1.0.0", Some(37384)),
+
+    // `extern "ptx-*" fn()`
+    (active, abi_ptx, "1.15.0", None),
 );
 
 declare_features! (
@@ -986,7 +989,19 @@ impl<'a> PostExpansionVisitor<'a> {
                 gate_feature_post!(&self, abi_sysv64, span,
                                    "sysv64 ABI is experimental and subject to change");
             },
-            _ => {}
+            Abi::PtxKernel => {
+                gate_feature_post!(&self, abi_ptx, span,
+                                   "PTX ABIs are experimental and subject to change");
+            }
+            // Stable
+            Abi::Cdecl |
+            Abi::Stdcall |
+            Abi::Fastcall |
+            Abi::Aapcs |
+            Abi::Win64 |
+            Abi::Rust |
+            Abi::C |
+            Abi::System => {}
         }
     }
 }
diff --git a/src/test/ui/codemap_tests/unicode.stderr b/src/test/ui/codemap_tests/unicode.stderr
index a748e13ecf1..70fe17888e8 100644
--- a/src/test/ui/codemap_tests/unicode.stderr
+++ b/src/test/ui/codemap_tests/unicode.stderr
@@ -1,4 +1,4 @@
-error: invalid ABI: expected one of [cdecl, stdcall, fastcall, vectorcall, aapcs, win64, sysv64, Rust, C, system, rust-intrinsic, rust-call, platform-intrinsic], found `路濫狼á́́`
+error: invalid ABI: expected one of [cdecl, stdcall, fastcall, vectorcall, aapcs, win64, sysv64, ptx-kernel, Rust, C, system, rust-intrinsic, rust-call, platform-intrinsic], found `路濫狼á́́`
   --> $DIR/unicode.rs:11:8
    |
 11 | extern "路濫狼á́́" fn foo() {}