From a4cf5e9eca00527082fd58356e4adc8775aeee03 Mon Sep 17 00:00:00 2001 From: Chen Zhongyao Date: Mon, 14 Jul 2025 13:35:53 +0000 Subject: Add new Tier-3 target: riscv64a23-unknown-linux-gnu --- src/bootstrap/src/core/sanity.rs | 1 + 1 file changed, 1 insertion(+) (limited to 'src') diff --git a/src/bootstrap/src/core/sanity.rs b/src/bootstrap/src/core/sanity.rs index 3080e641b5b..a2d5d3f0e37 100644 --- a/src/bootstrap/src/core/sanity.rs +++ b/src/bootstrap/src/core/sanity.rs @@ -34,6 +34,7 @@ pub struct Finder { // Targets can be removed from this list once they are present in the stage0 compiler (usually by updating the beta compiler of the bootstrap). const STAGE0_MISSING_TARGETS: &[&str] = &[ "armv7a-vex-v5", + "riscv64a23-unknown-linux-gnu", // just a dummy comment so the list doesn't get onelined ]; -- cgit 1.4.1-3-g733a5 From 988cd14ec1e1d07d8fadc2e76d370e1da8de4f46 Mon Sep 17 00:00:00 2001 From: Chen Zhongyao Date: Mon, 21 Jul 2025 16:31:01 +0000 Subject: Add a disabled builder for riscv64 rva23 emulated tests This will run all tests for `riscv64a23-unknown-linux-gnu` in a QEMU instance. 
--- .../host-x86_64/disabled/riscv64a23-gnu/Dockerfile | 108 +++++++++++++++++++++ .../disabled/riscv64a23-gnu/linux.config | 51 ++++++++++ src/tools/build-manifest/src/main.rs | 2 + src/tools/remote-test-client/src/main.rs | 29 +++++- 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile create mode 100644 src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config (limited to 'src') diff --git a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile new file mode 100644 index 00000000000..86b0d795687 --- /dev/null +++ b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile @@ -0,0 +1,108 @@ +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + bc \ + bzip2 \ + ca-certificates \ + cmake \ + cpio \ + curl \ + file \ + flex \ + bison \ + g++ \ + g++-riscv64-linux-gnu \ + git \ + libc6-dev \ + libc6-dev-riscv64-cross \ + libssl-dev \ + make \ + ninja-build \ + python3 \ + xz-utils \ + opensbi \ + u-boot-qemu \ + libslirp0 \ + build-essential \ + pkg-config \ + libglib2.0-dev \ + libpixman-1-dev \ + libsdl2-dev \ + libfdt-dev \ + python3 \ + python3-pip + +ENV ARCH=riscv \ + CROSS_COMPILE=riscv64-linux-gnu- + +WORKDIR /build + +# From https://github.com/michaeljclark/busybear-linux/blob/master/conf/linux.config +COPY host-x86_64/riscv64a23-gnu/linux.config /build + +# qemu v10.0.2 fully support +RUN curl https://gitlab.com/qemu-project/qemu/-/archive/v10.0.2/qemu-v10.0.2.tar.bz2 | tar xjf - && \ + cd qemu-v10.0.2 && \ + ./configure --target-list=riscv64-softmmu \ + --enable-sdl --enable-debug --enable-fdt --enable-slirp && \ + make -j && make install + +# use the opensbi fw from apt-get install +RUN cp /usr/lib/riscv64-linux-gnu/opensbi/generic/fw_jump.bin /tmp + +# Compile the kernel that we're going to be emulating with. 
This is +# basically just done to be compatible with the QEMU target that we're going +# to be using when running tests. +RUN curl https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.6.97.tar.xz | tar xJf - && \ + cp linux.config linux-6.6.97/.config && \ + cd /build/linux-6.6.97 && \ + make olddefconfig && \ + make -j$(nproc) Image && \ + cp arch/riscv/boot/Image /tmp && \ + rm -rf linux-6.6.97 + +# Compile an instance of busybox as this provides a lightweight system and init +# binary which we will boot into. Only trick here is configuring busybox to +# build static binaries. +RUN curl https://www.busybox.net/downloads/busybox-1.37.0.tar.bz2 | tar xjf - && \ + cd busybox-1.37.0 && \ + make defconfig && \ + sed -i 's/# CONFIG_STATIC is not set/CONFIG_STATIC=y/' .config && \ + sed -i 's/^CONFIG_TC=y$/# CONFIG_TC is not set/' .config && \ + sed -i 's/CONFIG_SHA1_HWACCEL=y/# CONFIG_SHA1_HWACCEL is not set/' .config && \ + sed -i 's/CONFIG_SHA256_HWACCEL=y/# CONFIG_SHA256_HWACCEL is not set/' .config && \ + make -j$(nproc) && \ + make install && \ + mv _install /tmp/rootfs && \ + cd /build && \ + rm -rf busybox-1.37.0 + +# Download the ubuntu rootfs, which we'll use as a chroot for all our tests. 
+WORKDIR /tmp +RUN mkdir rootfs/ubuntu +RUN curl https://cdimage.ubuntu.com/ubuntu-base/releases/24.04/release/ubuntu-base-24.04.2-base-riscv64.tar.gz | \ + tar xzf - -C rootfs/ubuntu && \ + cd rootfs && mkdir proc sys dev etc etc/init.d + +# Copy over our init script, which starts up our test server and also a few other +# misc tasks +COPY scripts/qemu-bare-bones-rcS rootfs/etc/init.d/rcS +RUN chmod +x rootfs/etc/init.d/rcS + +# Helper to quickly fill the entropy pool in the kernel +COPY scripts/qemu-bare-bones-addentropy.c /tmp/addentropy.c +RUN riscv64-linux-gnu-gcc addentropy.c -o rootfs/addentropy -static + +COPY scripts/sccache.sh /scripts/ +RUN sh /scripts/sccache.sh + +# Avoid "fatal: detected dubious ownership in repository at '/checkout'" error +RUN git config --global --add safe.directory "*" + +ENV RUST_CONFIGURE_ARGS \ + --set target.riscv64a23-unknown-linux-gnu.linker=riscv64-linux-gnu-gcc \ + --set target.riscv64a23-unknown-linux-gnu.qemu-rootfs=/tmp/rootfs +ENV SCRIPT python3 ../x.py --stage 2 test --host='' --target riscv64a23-unknown-linux-gnu + +ENV NO_CHANGE_USER=1 diff --git a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config new file mode 100644 index 00000000000..5142664742f --- /dev/null +++ b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config @@ -0,0 +1,51 @@ +CONFIG_DEFAULT_HOSTNAME="busybear" +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_BPF_SYSCALL=y +CONFIG_SMP=y +CONFIG_MODULES=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_NETLINK_DIAG=y +# CONFIG_WIRELESS is not set +CONFIG_PCI=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_NETDEVICES=y 
+CONFIG_VIRTIO_NET=y +# CONFIG_ETHERNET is not set +# CONFIG_WLAN is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_HVC_RISCV_SBI=y +# CONFIG_HW_RANDOM is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_VIRTIO_MMIO=y +CONFIG_SIFIVE_PLIC=y +CONFIG_RAS=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +# CONFIG_CRYPTO_ECHAINIV is not set +# CONFIG_CRYPTO_HW is not set +CONFIG_PRINTK_TIME=y diff --git a/src/tools/build-manifest/src/main.rs b/src/tools/build-manifest/src/main.rs index 0520eff0fa2..4f26c170de0 100644 --- a/src/tools/build-manifest/src/main.rs +++ b/src/tools/build-manifest/src/main.rs @@ -40,6 +40,7 @@ static HOSTS: &[&str] = &[ "powerpc64le-unknown-linux-gnu", "powerpc64le-unknown-linux-musl", "riscv64gc-unknown-linux-gnu", + "riscv64a23-unknown-linux-gnu", "s390x-unknown-linux-gnu", "sparcv9-sun-solaris", "x86_64-apple-darwin", @@ -155,6 +156,7 @@ static TARGETS: &[&str] = &[ "riscv64gc-unknown-none-elf", "riscv64gc-unknown-linux-gnu", "riscv64gc-unknown-linux-musl", + "riscv64a23-unknown-linux-gnu", "s390x-unknown-linux-gnu", "sparc64-unknown-linux-gnu", "sparcv9-sun-solaris", diff --git a/src/tools/remote-test-client/src/main.rs b/src/tools/remote-test-client/src/main.rs index b9741431b50..e5b51722f92 100644 --- a/src/tools/remote-test-client/src/main.rs +++ b/src/tools/remote-test-client/src/main.rs @@ -111,7 +111,9 @@ fn prepare_rootfs(target: &str, rootfs: &Path, server: &Path, rootfs_img: &Path) "arm-unknown-linux-gnueabihf" | "aarch64-unknown-linux-gnu" => { prepare_rootfs_cpio(rootfs, rootfs_img) } - "riscv64gc-unknown-linux-gnu" => prepare_rootfs_ext4(rootfs, rootfs_img), + "riscv64a23-unknown-linux-gnu" | "riscv64gc-unknown-linux-gnu" => { + prepare_rootfs_ext4(rootfs, rootfs_img) + } _ => panic!("{} is not supported", target), } } @@ -234,6 +236,31 @@ fn start_qemu_emulator(target: &str, rootfs: &Path, 
server: &Path, tmpdir: &Path .arg(&format!("file={},format=raw,id=hd0", &rootfs_img.to_string_lossy())); t!(cmd.spawn()); } + "riscv64a23-unknown-linux-gnu" => { + let mut cmd = Command::new("qemu-system-riscv64"); + cmd.arg("-nographic") + .arg("-machine") + .arg("virt") + .arg("-cpu") + .arg("rva23s64") + .arg("-m") + .arg("1024") + .arg("-bios") + .arg("/tmp/fw_jump.bin") + .arg("-kernel") + .arg("/tmp/Image") + .arg("-append") + .arg("quiet console=ttyS0 root=/dev/vda rw") + .arg("-netdev") + .arg("user,id=net0,hostfwd=tcp::12345-:12345") + .arg("-device") + .arg("virtio-net-device,netdev=net0,mac=00:00:00:00:00:00") + .arg("-device") + .arg("virtio-blk-device,drive=hd0") + .arg("-drive") + .arg(&format!("file={},format=raw,id=hd0,if=none", &rootfs_img.to_string_lossy())); + t!(cmd.spawn()); + } _ => panic!("cannot start emulator for: {}", target), } } -- cgit 1.4.1-3-g733a5 From 31ae666f0f57a5fefd924a9945e5aca816c001f8 Mon Sep 17 00:00:00 2001 From: Zhongyao Chen Date: Tue, 29 Jul 2025 21:20:03 +0000 Subject: Add target page for riscv64a23-unknown-linux-gnu --- src/doc/rustc/src/SUMMARY.md | 1 + src/doc/rustc/src/platform-support.md | 1 + .../riscv64a23-unknown-linux-gnu.md | 66 ++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md (limited to 'src') diff --git a/src/doc/rustc/src/SUMMARY.md b/src/doc/rustc/src/SUMMARY.md index 25f154f1180..79e229595c8 100644 --- a/src/doc/rustc/src/SUMMARY.md +++ b/src/doc/rustc/src/SUMMARY.md @@ -104,6 +104,7 @@ - [riscv32imac-unknown-xous-elf](platform-support/riscv32imac-unknown-xous-elf.md) - [riscv64gc-unknown-linux-gnu](platform-support/riscv64gc-unknown-linux-gnu.md) - [riscv64gc-unknown-linux-musl](platform-support/riscv64gc-unknown-linux-musl.md) + - [riscv64a23-unknown-linux-gnu](platform-support/riscv64a23-unknown-linux-gnu.md) - [s390x-unknown-linux-gnu](platform-support/s390x-unknown-linux-gnu.md) - 
[s390x-unknown-linux-musl](platform-support/s390x-unknown-linux-musl.md) - [sparc-unknown-none-elf](./platform-support/sparc-unknown-none-elf.md) diff --git a/src/doc/rustc/src/platform-support.md b/src/doc/rustc/src/platform-support.md index 8ebaa8dd874..cf55d4bd68f 100644 --- a/src/doc/rustc/src/platform-support.md +++ b/src/doc/rustc/src/platform-support.md @@ -389,6 +389,7 @@ target | std | host | notes [`riscv64gc-unknown-nuttx-elf`](platform-support/nuttx.md) | ✓ | | RISC-V 64bit with NuttX [`riscv64gc-unknown-openbsd`](platform-support/openbsd.md) | ✓ | ✓ | OpenBSD/riscv64 [`riscv64imac-unknown-nuttx-elf`](platform-support/nuttx.md) | ✓ | | RISC-V 64bit with NuttX +[`riscv64a23-unknown-linux-gnu`](platform-support/riscv64a23-unknown-linux-gnu.md) | ✓ | ✓ | RISC-V Linux (kernel 4.20+, glibc 2.29) [`s390x-unknown-linux-musl`](platform-support/s390x-unknown-linux-musl.md) | ✓ | | S390x Linux (kernel 3.2, musl 1.2.3) `sparc-unknown-linux-gnu` | ✓ | | 32-bit SPARC Linux [`sparc-unknown-none-elf`](./platform-support/sparc-unknown-none-elf.md) | * | | Bare 32-bit SPARC V7+ diff --git a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md new file mode 100644 index 00000000000..00790ba197f --- /dev/null +++ b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md @@ -0,0 +1,66 @@ +# `riscv64a23-unknown-linux-gnu` + +**Tier: 3** + +RISC-V targets using the ratified [RVA23 Profile](https://github.com/riscv/riscv-profiles/blob/main/rva23-profile.adoc). +This target will enable all mandary features of rva23u64 and rva23s64 by default. 
+ + +## Target maintainers + +[@ZhongyaoChen](https://github.com/ZhongyaoChen) +[@CaiWeiran](https://github.com/CaiWeiran) + +## Requirements + +This target requires: + +* Linux Kernel version 4.20 or later +* glibc 2.17 or later + + +## Building the target + +The target is distributed through `rustup`, and otherwise require no +special configuration. + +If you need to build your own Rust for some reason though, the target can be build with: + +```bash +./x build --target riscv64a23-unknown-linux-gnu +``` + +## Building Rust programs + +Add the target: + +```bash +rustup target add riscv64a23-unknown-linux-gnu +``` + +Then cross compile crates with: + +```bash +cargo build --target riscv64a23-unknown-linux-gnu +``` + +## Cross-compilation toolchains and Testing + +On Ubuntu 24.04, we can install compilation dependencies with: + +```bash +apt install -y git python3 g++ g++-riscv64-linux-gnu +``` + +Then build target with: + +```bash +./x build --target=riscv64a23-unknown-linux-gnu +``` + +There are no special requirements for testing and running the targets. +For testing cross-builds on the host, you can use the docker image. It will automatically set up a RISC-V QEMU emulator and run all the test suite. 
+ +```bash +DEPLOY=1 ./src/ci/docker/run.sh riscv64a23-gnu +``` -- cgit 1.4.1-3-g733a5 From 93a8c1d12cee1af04d84ffc59597dbd877de671f Mon Sep 17 00:00:00 2001 From: zhongyao Date: Fri, 8 Aug 2025 14:42:51 +0800 Subject: Update src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md Co-authored-by: zachs18 <8355914+zachs18@users.noreply.github.com> --- .../riscv64a23-unknown-linux-gnu.md | 36 ++++------------------ 1 file changed, 6 insertions(+), 30 deletions(-) (limited to 'src') diff --git a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md index 00790ba197f..6db4a672936 100644 --- a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md +++ b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md @@ -2,10 +2,9 @@ **Tier: 3** -RISC-V targets using the ratified [RVA23 Profile](https://github.com/riscv/riscv-profiles/blob/main/rva23-profile.adoc). +RISC-V target using the ratified [RVA23 Profile](https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc). This target will enable all mandary features of rva23u64 and rva23s64 by default. - ## Target maintainers [@ZhongyaoChen](https://github.com/ZhongyaoChen) @@ -18,13 +17,11 @@ This target requires: * Linux Kernel version 4.20 or later * glibc 2.17 or later - ## Building the target -The target is distributed through `rustup`, and otherwise require no -special configuration. +Tier-3 target is not distributed through `rustup`. 
-If you need to build your own Rust for some reason though, the target can be build with: +You need to build your own Rust, the target can be build with: ```bash ./x build --target riscv64a23-unknown-linux-gnu @@ -32,35 +29,14 @@ If you need to build your own Rust for some reason though, the target can be bui ## Building Rust programs -Add the target: +Add the toolchain: ```bash -rustup target add riscv64a23-unknown-linux-gnu +rustup toolchain link rva23-toolchain {path-to-rust}/build/host/stage2 ``` Then cross compile crates with: ```bash -cargo build --target riscv64a23-unknown-linux-gnu -``` - -## Cross-compilation toolchains and Testing - -On Ubuntu 24.04, we can install compilation dependencies with: - -```bash -apt install -y git python3 g++ g++-riscv64-linux-gnu -``` - -Then build target with: - -```bash -./x build --target=riscv64a23-unknown-linux-gnu -``` - -There are no special requirements for testing and running the targets. -For testing cross-builds on the host, you can use the docker image. It will automatically set up a RISC-V QEMU emulator and run all the test suite. - -```bash -DEPLOY=1 ./src/ci/docker/run.sh riscv64a23-gnu +RUSTFLAGS="-C linker=riscv64-linux-gnu-gcc" cargo +rva23-toolchain build --target=riscv64a23-unknown-linux-gnu ``` -- cgit 1.4.1-3-g733a5 From 87fd28998c92a014935d879ae0129b09baa56588 Mon Sep 17 00:00:00 2001 From: Zhongyao Chen Date: Sat, 9 Aug 2025 12:27:38 +0000 Subject: Revert "Add a disabled builder for riscv64 rva23 emulated tests" This reverts commit 289688ca494ddc45bcd0b0706dc173c19fbb7caa. 
--- .../host-x86_64/disabled/riscv64a23-gnu/Dockerfile | 108 --------------------- .../disabled/riscv64a23-gnu/linux.config | 51 ---------- src/tools/build-manifest/src/main.rs | 2 - src/tools/remote-test-client/src/main.rs | 29 +----- 4 files changed, 1 insertion(+), 189 deletions(-) delete mode 100644 src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile delete mode 100644 src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config (limited to 'src') diff --git a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile deleted file mode 100644 index 86b0d795687..00000000000 --- a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/Dockerfile +++ /dev/null @@ -1,108 +0,0 @@ -FROM ubuntu:24.04 - -ARG DEBIAN_FRONTEND=noninteractive -RUN apt-get update -y && apt-get install -y --no-install-recommends \ - bc \ - bzip2 \ - ca-certificates \ - cmake \ - cpio \ - curl \ - file \ - flex \ - bison \ - g++ \ - g++-riscv64-linux-gnu \ - git \ - libc6-dev \ - libc6-dev-riscv64-cross \ - libssl-dev \ - make \ - ninja-build \ - python3 \ - xz-utils \ - opensbi \ - u-boot-qemu \ - libslirp0 \ - build-essential \ - pkg-config \ - libglib2.0-dev \ - libpixman-1-dev \ - libsdl2-dev \ - libfdt-dev \ - python3 \ - python3-pip - -ENV ARCH=riscv \ - CROSS_COMPILE=riscv64-linux-gnu- - -WORKDIR /build - -# From https://github.com/michaeljclark/busybear-linux/blob/master/conf/linux.config -COPY host-x86_64/riscv64a23-gnu/linux.config /build - -# qemu v10.0.2 fully support -RUN curl https://gitlab.com/qemu-project/qemu/-/archive/v10.0.2/qemu-v10.0.2.tar.bz2 | tar xjf - && \ - cd qemu-v10.0.2 && \ - ./configure --target-list=riscv64-softmmu \ - --enable-sdl --enable-debug --enable-fdt --enable-slirp && \ - make -j && make install - -# use the opensbi fw from apt-get install -RUN cp /usr/lib/riscv64-linux-gnu/opensbi/generic/fw_jump.bin /tmp - -# Compile the kernel that we're going to be emulating with. 
This is -# basically just done to be compatible with the QEMU target that we're going -# to be using when running tests. -RUN curl https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.6.97.tar.xz | tar xJf - && \ - cp linux.config linux-6.6.97/.config && \ - cd /build/linux-6.6.97 && \ - make olddefconfig && \ - make -j$(nproc) Image && \ - cp arch/riscv/boot/Image /tmp && \ - rm -rf linux-6.6.97 - -# Compile an instance of busybox as this provides a lightweight system and init -# binary which we will boot into. Only trick here is configuring busybox to -# build static binaries. -RUN curl https://www.busybox.net/downloads/busybox-1.37.0.tar.bz2 | tar xjf - && \ - cd busybox-1.37.0 && \ - make defconfig && \ - sed -i 's/# CONFIG_STATIC is not set/CONFIG_STATIC=y/' .config && \ - sed -i 's/^CONFIG_TC=y$/# CONFIG_TC is not set/' .config && \ - sed -i 's/CONFIG_SHA1_HWACCEL=y/# CONFIG_SHA1_HWACCEL is not set/' .config && \ - sed -i 's/CONFIG_SHA256_HWACCEL=y/# CONFIG_SHA256_HWACCEL is not set/' .config && \ - make -j$(nproc) && \ - make install && \ - mv _install /tmp/rootfs && \ - cd /build && \ - rm -rf busybox-1.37.0 - -# Download the ubuntu rootfs, which we'll use as a chroot for all our tests. 
-WORKDIR /tmp -RUN mkdir rootfs/ubuntu -RUN curl https://cdimage.ubuntu.com/ubuntu-base/releases/24.04/release/ubuntu-base-24.04.2-base-riscv64.tar.gz | \ - tar xzf - -C rootfs/ubuntu && \ - cd rootfs && mkdir proc sys dev etc etc/init.d - -# Copy over our init script, which starts up our test server and also a few other -# misc tasks -COPY scripts/qemu-bare-bones-rcS rootfs/etc/init.d/rcS -RUN chmod +x rootfs/etc/init.d/rcS - -# Helper to quickly fill the entropy pool in the kernel -COPY scripts/qemu-bare-bones-addentropy.c /tmp/addentropy.c -RUN riscv64-linux-gnu-gcc addentropy.c -o rootfs/addentropy -static - -COPY scripts/sccache.sh /scripts/ -RUN sh /scripts/sccache.sh - -# Avoid "fatal: detected dubious ownership in repository at '/checkout'" error -RUN git config --global --add safe.directory "*" - -ENV RUST_CONFIGURE_ARGS \ - --set target.riscv64a23-unknown-linux-gnu.linker=riscv64-linux-gnu-gcc \ - --set target.riscv64a23-unknown-linux-gnu.qemu-rootfs=/tmp/rootfs -ENV SCRIPT python3 ../x.py --stage 2 test --host='' --target riscv64a23-unknown-linux-gnu - -ENV NO_CHANGE_USER=1 diff --git a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config b/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config deleted file mode 100644 index 5142664742f..00000000000 --- a/src/ci/docker/host-x86_64/disabled/riscv64a23-gnu/linux.config +++ /dev/null @@ -1,51 +0,0 @@ -CONFIG_DEFAULT_HOSTNAME="busybear" -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -CONFIG_CGROUP_BPF=y -CONFIG_NAMESPACES=y -CONFIG_USER_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_BPF_SYSCALL=y -CONFIG_SMP=y -CONFIG_MODULES=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_INET=y -CONFIG_NETLINK_DIAG=y -# CONFIG_WIRELESS is not set -CONFIG_PCI=y -CONFIG_DEVTMPFS=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_VIRTIO_BLK=y 
-CONFIG_NETDEVICES=y -CONFIG_VIRTIO_NET=y -# CONFIG_ETHERNET is not set -# CONFIG_WLAN is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_HVC_RISCV_SBI=y -# CONFIG_HW_RANDOM is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_VIRTIO_MMIO=y -CONFIG_SIFIVE_PLIC=y -CONFIG_RAS=y -CONFIG_EXT2_FS=y -CONFIG_EXT3_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_AUTOFS4_FS=y -CONFIG_MSDOS_FS=y -CONFIG_VFAT_FS=y -CONFIG_TMPFS=y -# CONFIG_CRYPTO_ECHAINIV is not set -# CONFIG_CRYPTO_HW is not set -CONFIG_PRINTK_TIME=y diff --git a/src/tools/build-manifest/src/main.rs b/src/tools/build-manifest/src/main.rs index 4f26c170de0..0520eff0fa2 100644 --- a/src/tools/build-manifest/src/main.rs +++ b/src/tools/build-manifest/src/main.rs @@ -40,7 +40,6 @@ static HOSTS: &[&str] = &[ "powerpc64le-unknown-linux-gnu", "powerpc64le-unknown-linux-musl", "riscv64gc-unknown-linux-gnu", - "riscv64a23-unknown-linux-gnu", "s390x-unknown-linux-gnu", "sparcv9-sun-solaris", "x86_64-apple-darwin", @@ -156,7 +155,6 @@ static TARGETS: &[&str] = &[ "riscv64gc-unknown-none-elf", "riscv64gc-unknown-linux-gnu", "riscv64gc-unknown-linux-musl", - "riscv64a23-unknown-linux-gnu", "s390x-unknown-linux-gnu", "sparc64-unknown-linux-gnu", "sparcv9-sun-solaris", diff --git a/src/tools/remote-test-client/src/main.rs b/src/tools/remote-test-client/src/main.rs index e5b51722f92..b9741431b50 100644 --- a/src/tools/remote-test-client/src/main.rs +++ b/src/tools/remote-test-client/src/main.rs @@ -111,9 +111,7 @@ fn prepare_rootfs(target: &str, rootfs: &Path, server: &Path, rootfs_img: &Path) "arm-unknown-linux-gnueabihf" | "aarch64-unknown-linux-gnu" => { prepare_rootfs_cpio(rootfs, rootfs_img) } - "riscv64a23-unknown-linux-gnu" | "riscv64gc-unknown-linux-gnu" => { - prepare_rootfs_ext4(rootfs, rootfs_img) - } + "riscv64gc-unknown-linux-gnu" => prepare_rootfs_ext4(rootfs, rootfs_img), _ => panic!("{} is not supported", target), } } @@ -236,31 +234,6 @@ fn start_qemu_emulator(target: 
&str, rootfs: &Path, server: &Path, tmpdir: &Path .arg(&format!("file={},format=raw,id=hd0", &rootfs_img.to_string_lossy())); t!(cmd.spawn()); } - "riscv64a23-unknown-linux-gnu" => { - let mut cmd = Command::new("qemu-system-riscv64"); - cmd.arg("-nographic") - .arg("-machine") - .arg("virt") - .arg("-cpu") - .arg("rva23s64") - .arg("-m") - .arg("1024") - .arg("-bios") - .arg("/tmp/fw_jump.bin") - .arg("-kernel") - .arg("/tmp/Image") - .arg("-append") - .arg("quiet console=ttyS0 root=/dev/vda rw") - .arg("-netdev") - .arg("user,id=net0,hostfwd=tcp::12345-:12345") - .arg("-device") - .arg("virtio-net-device,netdev=net0,mac=00:00:00:00:00:00") - .arg("-device") - .arg("virtio-blk-device,drive=hd0") - .arg("-drive") - .arg(&format!("file={},format=raw,id=hd0,if=none", &rootfs_img.to_string_lossy())); - t!(cmd.spawn()); - } _ => panic!("cannot start emulator for: {}", target), } } -- cgit 1.4.1-3-g733a5 From c99224536152cff14639b64a3fa4f7a215fd037c Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 9 Aug 2025 23:52:11 +0100 Subject: refactor: Include table sizes in comment at top of `unicode_data.rs` To make changes in table size obvious from git diffs --- library/core/src/unicode/unicode_data.rs | 10 ++++++++++ src/tools/unicode-table-generator/src/main.rs | 20 +++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9..6059f7d6450 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,4 +1,14 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! 
+// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist +// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist +// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist +// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist +// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset +// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist +// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset +// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading +// Total : 6530 bytes #[inline(always)] const fn bitset_search< diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bd..c1017142097 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -239,6 +239,11 @@ fn main() { std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); } + let mut table_file = String::new(); + table_file.push_str( + "///! 
This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", + ); + let mut total_bytes = 0; let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { @@ -252,8 +257,8 @@ fn main() { } modules.push((property.to_lowercase().to_string(), emitter.file)); - println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", + table_file.push_str(&format!( + "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -261,15 +266,10 @@ fn main() { ranges.first().unwrap().start, ranges.last().unwrap().end, emitter.desc, - ); + )); total_bytes += emitter.bytes_used; } - - let mut table_file = String::new(); - - table_file.push_str( - "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", - ); + table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n'); @@ -296,8 +296,6 @@ fn main() { } std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); - - println!("Total table sizes: {total_bytes} bytes"); } fn version() -> String { -- cgit 1.4.1-3-g733a5 From 69e1974bb0bfbcc679d29950b1e4540cd0b9b3ee Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 00:18:04 +0100 Subject: refactor: Include size of case conversion tables Include the sizes of the `to_lowercase` and `to_uppercase` tables in the total size calculations. 
--- library/core/src/unicode/unicode_data.rs | 12 ++++--- .../unicode-table-generator/src/case_mapping.rs | 41 +++++++++++++++------- src/tools/unicode-table-generator/src/main.rs | 7 +++- 3 files changed, 42 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 6059f7d6450..787efcc0914 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -8,7 +8,9 @@ // N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist // Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset // White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading -// Total : 6530 bytes +// to_lower : 11484 bytes +// to_upper : 13432 bytes +// Total : 31446 bytes #[inline(always)] const fn bitset_search< @@ -782,7 +784,7 @@ pub mod conversions { } } - static LOWERCASE_TABLE: &[(char, u32)] = &[ + static LOWERCASE_TABLE: &[(char, u32); 1434] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), ('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238), @@ -1132,11 +1134,11 @@ pub mod conversions { ('\u{1e921}', 125251), ]; - static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], ]; - static UPPERCASE_TABLE: &[(char, u32)] = &[ + static UPPERCASE_TABLE: &[(char, u32); 1526] = &[ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), ('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204), @@ -1509,7 +1511,7 @@ pub mod conversions { ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - 
static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], ['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'], diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 9c6454492e7..a8527ea9a42 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,20 +6,22 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); file.push_str("\n\n"); file.push_str(HEADER.trim_start()); file.push('\n'); - file.push_str(&generate_tables("LOWER", &data.to_lower)); + let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower); + file.push_str(&lower_tables); file.push_str("\n\n"); - file.push_str(&generate_tables("UPPER", &data.to_upper)); - file + let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); + file.push_str(&upper_tables); + (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> String { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap) -> String } let mut tables = String::new(); - - write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings)) - .unwrap(); + let mut size = 0; + + size += size_of_val(mappings.as_slice()); + write!( + tables, + "static {}CASE_TABLE: &[(char, u32); {}] = 
&[{}];", + case, + mappings.len(), + fmt_list(mappings), + ) + .unwrap(); tables.push_str("\n\n"); - write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis)) - .unwrap(); - - tables + size += size_of_val(multis.as_slice()); + write!( + tables, + "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", + case, + multis.len(), + fmt_list(multis), + ) + .unwrap(); + + (tables, size) } struct CharEscape(char); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index c1017142097..f755ad048e4 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -269,6 +269,11 @@ fn main() { )); total_bytes += emitter.bytes_used; } + let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); + for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { + table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); + total_bytes += size; + } table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function @@ -280,7 +285,7 @@ fn main() { table_file.push('\n'); - modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); + modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { table_file.push_str("#[rustfmt::skip]\n"); -- cgit 1.4.1-3-g733a5 From 5d54ac5276eade9d9424fc4fbfa6f77bcd5d4940 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 01:10:15 +0100 Subject: refactor: rewrite `ranges_from_set` The `merge_ranges` function was very complicated and hard to understand. Forunately, we can use `slice::chunk_by` to achieve the same thing. 
--- src/tools/unicode-table-generator/src/main.rs | 83 ++++++--------------------- 1 file changed, 17 insertions(+), 66 deletions(-) (limited to 'src') diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index f755ad048e4..bf0511a2c77 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -187,33 +187,19 @@ fn load_data() -> UnicodeData { } } - let mut properties: HashMap<&'static str, Vec>> = properties + let mut properties: Vec<(&'static str, Vec>)> = properties .into_iter() - .map(|(k, v)| { - ( - k, - v.into_iter() - .flat_map(|codepoints| match codepoints { - Codepoints::Single(c) => c - .scalar() - .map(|ch| ch as u32..ch as u32 + 1) - .into_iter() - .collect::>(), - Codepoints::Range(c) => c - .into_iter() - .flat_map(|c| c.scalar().map(|ch| ch as u32..ch as u32 + 1)) - .collect::>(), - }) - .collect::>>(), - ) + .map(|(prop, codepoints)| { + let codepoints = codepoints + .into_iter() + .flatten() + .flat_map(|cp| cp.scalar()) + .map(u32::from) + .collect::>(); + (prop, ranges_from_set(&codepoints)) }) .collect(); - for ranges in properties.values_mut() { - merge_ranges(ranges); - } - - let mut properties = properties.into_iter().collect::>(); properties.sort_by_key(|p| p.0); UnicodeData { ranges: properties, to_lower, to_upper } } @@ -402,48 +388,13 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool } } +/// Group the elements of `set` into contigous ranges fn ranges_from_set(set: &[u32]) -> Vec> { - let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::>>(); - merge_ranges(&mut ranges); - ranges -} - -fn merge_ranges(ranges: &mut Vec>) { - loop { - let mut new_ranges = Vec::new(); - let mut idx_iter = 0..(ranges.len() - 1); - let mut should_insert_last = true; - while let Some(idx) = idx_iter.next() { - let cur = ranges[idx].clone(); - let next = ranges[idx + 1].clone(); - if cur.end == next.start { - if 
idx_iter.next().is_none() { - // We're merging the last element - should_insert_last = false; - } - new_ranges.push(cur.start..next.end); - } else { - // We're *not* merging the last element - should_insert_last = true; - new_ranges.push(cur); - } - } - if should_insert_last { - new_ranges.push(ranges.last().unwrap().clone()); - } - if new_ranges.len() == ranges.len() { - *ranges = new_ranges; - break; - } else { - *ranges = new_ranges; - } - } - - let mut last_end = None; - for range in ranges { - if let Some(last) = last_end { - assert!(range.start > last, "{range:?}"); - } - last_end = Some(range.end); - } + set.chunk_by(|a, b| a + 1 == *b) + .map(|chunk| { + let start = *chunk.first().unwrap(); + let end = *chunk.last().unwrap(); + start..(end + 1) + }) + .collect() } -- cgit 1.4.1-3-g733a5 From 30d1bc7ba869c0f86bc6d2e1d9ed1ad3b58f7865 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 01:46:00 +0100 Subject: refactor: `generate_tests` Rewrite `generate_tests` to be more idiomatic. --- src/tools/unicode-table-generator/src/main.rs | 97 +++++++++++++-------------- 1 file changed, 45 insertions(+), 52 deletions(-) (limited to 'src') diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index bf0511a2c77..c9530fec48a 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -72,6 +72,8 @@ //! or not. 
use std::collections::{BTreeMap, HashMap}; +use std::fmt; +use std::fmt::Write; use std::ops::Range; use ucd_parse::Codepoints; @@ -222,7 +224,7 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); + std::fs::write(&path, generate_tests(ranges_by_property).unwrap()).unwrap(); } let mut table_file = String::new(); @@ -326,66 +328,57 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String { +fn generate_tests(ranges: &[(&str, Vec>)]) -> Result { let mut s = String::new(); - s.push_str("#![allow(incomplete_features, unused)]\n"); - s.push_str("#![feature(const_generics)]\n\n"); - s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); - s.push_str(&format!("#[path = \"{data_path}\"]\n")); - s.push_str("mod unicode_data;\n\n"); - - s.push_str("\nfn main() {\n"); - + writeln!(s, "#![feature(core_intrinsics)]")?; + writeln!(s, "#![allow(internal_features, dead_code)]")?; + writeln!(s, "// ignore-tidy-filelength")?; + writeln!(s, "use std::intrinsics;")?; + writeln!(s, "mod unicode_data;")?; + writeln!(s, "fn main() {{")?; for (property, ranges) in ranges { - s.push_str(&format!(r#" println!("Testing {property}");"#)); - s.push('\n'); - s.push_str(&format!(" {}_true();\n", property.to_lowercase())); - s.push_str(&format!(" {}_false();\n", property.to_lowercase())); - let mut is_true = Vec::new(); - let mut is_false = Vec::new(); - for ch_num in 0..(std::char::MAX as u32) { - if std::char::from_u32(ch_num).is_none() { - continue; - } - if ranges.iter().any(|r| r.contains(&ch_num)) { - is_true.push(ch_num); - } else { - is_false.push(ch_num); - } - } - - s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_true, true); - s.push_str(" }\n\n"); - s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); - 
generate_asserts(&mut s, property, &is_false, false); - s.push_str(" }\n\n"); + let prop = property.to_lowercase(); + writeln!(s, r#" println!("Testing {prop}");"#)?; + writeln!(s, " {prop}_true();")?; + writeln!(s, " {prop}_false();")?; + let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .partition(|c| ranges.iter().any(|r| r.contains(c))); + + writeln!(s, " fn {prop}_true() {{")?; + generate_asserts(&mut s, &prop, &is_true, true)?; + writeln!(s, " }}")?; + + writeln!(s, " fn {prop}_false() {{")?; + generate_asserts(&mut s, &prop, &is_false, false)?; + writeln!(s, " }}")?; } - s.push('}'); - s + writeln!(s, "}}")?; + Ok(s) } -fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { +fn generate_asserts( + s: &mut String, + prop: &str, + points: &[u32], + truthy: bool, +) -> Result<(), fmt::Error> { + let truthy = if truthy { "" } else { "!" }; for range in ranges_from_set(points) { - if range.end == range.start + 1 { - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - std::char::from_u32(range.start).unwrap(), - range.start, - )); - } else { - s.push_str(&format!(" for chn in {range:?}u32 {{\n")); - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", - if truthy { "" } else { "!" 
}, - property.to_lowercase(), - )); - s.push_str(" }\n"); + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + match range.len() { + 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, + _ => { + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; + writeln!(s, " }}")?; + } } } + Ok(()) } /// Group the elements of `set` into contigous ranges -- cgit 1.4.1-3-g733a5 From c3ce0796544152460554d8b8db4e56528fe362db Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 02:13:03 +0100 Subject: refactor: Add tests for case conversions --- .../unicode-table-generator/src/case_mapping.rs | 4 +- src/tools/unicode-table-generator/src/main.rs | 48 ++++++++++++++++++---- 2 files changed, 41 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index a8527ea9a42..49aef3ec33e 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -21,11 +21,11 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); - for (&key, &(a, b, c)) in data.iter() { + for (&key, &[a, b, c]) in data.iter() { let key = char::from_u32(key).unwrap(); if key.is_ascii() { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index c9530fec48a..1d70eebdbc6 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -100,11 +100,11 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData 
{ ranges: Vec<(&'static str, Vec>)>, - to_upper: BTreeMap, - to_lower: BTreeMap, + to_upper: BTreeMap, + to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32, u32, u32)> { +fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { let mut a = None; let mut b = None; let mut c = None; @@ -125,7 +125,7 @@ fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32 } } - Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) + Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) } static UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -165,12 +165,12 @@ fn load_data() -> UnicodeData { if let Some(mapped) = row.simple_lowercase_mapping && mapped != row.codepoint { - to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } if let Some(mapped) = row.simple_uppercase_mapping && mapped != row.codepoint { - to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } } @@ -224,7 +224,7 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(ranges_by_property).unwrap()).unwrap(); + std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); } let mut table_file = String::new(); @@ -328,7 +328,7 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(ranges: &[(&str, Vec>)]) -> Result { +fn generate_tests(data: &UnicodeData) -> Result { let mut s = String::new(); writeln!(s, "#![feature(core_intrinsics)]")?; writeln!(s, "#![allow(internal_features, dead_code)]")?; @@ -336,7 +336,7 @@ fn generate_tests(ranges: &[(&str, Vec>)]) -> Result>)]) -> Result = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .filter(|c| !conversion.contains_key(c)) + .collect(); + let unmapped_ranges = ranges_from_set(&unmapped); + for range in unmapped_ranges { + let start = 
char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# + )?; + + writeln!(s, " }}")?; + } + } + writeln!(s, "}}")?; Ok(s) } -- cgit 1.4.1-3-g733a5 From b951b5dca116803b89380aae55ce9053a4674f31 Mon Sep 17 00:00:00 2001 From: Kivooeo Date: Fri, 15 Aug 2025 16:56:11 +0000 Subject: stabilize strict provenance atomic ptr --- library/core/src/sync/atomic.rs | 21 +++++++-------------- library/coretests/tests/lib.rs | 1 - library/std/src/lib.rs | 1 - src/tools/miri/tests/pass/atomic.rs | 1 - tests/codegen-llvm/atomicptr.rs | 1 - 5 files changed, 7 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/library/core/src/sync/atomic.rs b/library/core/src/sync/atomic.rs index 44a6895f90a..7bd68bcd0bc 100644 --- a/library/core/src/sync/atomic.rs +++ b/library/core/src/sync/atomic.rs @@ -2199,7 +2199,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let atom = AtomicPtr::::new(core::ptr::null_mut()); @@ -2209,7 +2208,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_ptr_add(&self, val: usize, order: Ordering) -> *mut T { self.fetch_byte_add(val.wrapping_mul(size_of::()), order) @@ -2240,7 +2239,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let array = [1i32, 2i32]; @@ -2254,7 +2252,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = 
"strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_ptr_sub(&self, val: usize, order: Ordering) -> *mut T { self.fetch_byte_sub(val.wrapping_mul(size_of::()), order) @@ -2279,7 +2277,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let atom = AtomicPtr::::new(core::ptr::null_mut()); @@ -2289,7 +2286,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_byte_add(&self, val: usize, order: Ordering) -> *mut T { // SAFETY: data races are prevented by atomic intrinsics. @@ -2315,7 +2312,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let mut arr = [0i64, 1]; @@ -2325,7 +2321,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_byte_sub(&self, val: usize, order: Ordering) -> *mut T { // SAFETY: data races are prevented by atomic intrinsics. 
@@ -2361,7 +2357,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let pointer = &mut 3i64 as *mut i64; @@ -2376,7 +2371,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_or(&self, val: usize, order: Ordering) -> *mut T { // SAFETY: data races are prevented by atomic intrinsics. @@ -2412,7 +2407,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let pointer = &mut 3i64 as *mut i64; @@ -2426,7 +2420,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_and(&self, val: usize, order: Ordering) -> *mut T { // SAFETY: data races are prevented by atomic intrinsics. 
@@ -2462,7 +2456,6 @@ impl AtomicPtr { /// # Examples /// /// ``` - /// #![feature(strict_provenance_atomic_ptr)] /// use core::sync::atomic::{AtomicPtr, Ordering}; /// /// let pointer = &mut 3i64 as *mut i64; @@ -2474,7 +2467,7 @@ impl AtomicPtr { /// ``` #[inline] #[cfg(target_has_atomic = "ptr")] - #[unstable(feature = "strict_provenance_atomic_ptr", issue = "99108")] + #[stable(feature = "strict_provenance_atomic_ptr", since = "CURRENT_RUSTC_VERSION")] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub fn fetch_xor(&self, val: usize, order: Ordering) -> *mut T { // SAFETY: data races are prevented by atomic intrinsics. diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index 0c4d49f3c99..92774e12681 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -95,7 +95,6 @@ #![feature(std_internals)] #![feature(step_trait)] #![feature(str_internals)] -#![feature(strict_provenance_atomic_ptr)] #![feature(strict_provenance_lints)] #![feature(test)] #![feature(trusted_len)] diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index f111fcb4a47..77c7878ce08 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -370,7 +370,6 @@ #![feature(slice_range)] #![feature(std_internals)] #![feature(str_internals)] -#![feature(strict_provenance_atomic_ptr)] #![feature(sync_unsafe_cell)] #![feature(temporary_niche_types)] #![feature(ub_checks)] diff --git a/src/tools/miri/tests/pass/atomic.rs b/src/tools/miri/tests/pass/atomic.rs index 3de34e570c7..d8ac5114f27 100644 --- a/src/tools/miri/tests/pass/atomic.rs +++ b/src/tools/miri/tests/pass/atomic.rs @@ -2,7 +2,6 @@ //@[tree]compile-flags: -Zmiri-tree-borrows //@compile-flags: -Zmiri-strict-provenance -#![feature(strict_provenance_atomic_ptr)] // FIXME(static_mut_refs): Do not allow `static_mut_refs` lint #![allow(static_mut_refs)] diff --git a/tests/codegen-llvm/atomicptr.rs b/tests/codegen-llvm/atomicptr.rs index 
ce6c4aa0d2b..9d5e618fe76 100644 --- a/tests/codegen-llvm/atomicptr.rs +++ b/tests/codegen-llvm/atomicptr.rs @@ -6,7 +6,6 @@ //@ compile-flags: -Copt-level=3 -Cno-prepopulate-passes #![crate_type = "lib"] -#![feature(strict_provenance_atomic_ptr)] use std::ptr::without_provenance_mut; use std::sync::atomic::AtomicPtr; -- cgit 1.4.1-3-g733a5 From 1bb9b151c9f9b5116254827f04add845aff33408 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Mon, 11 Aug 2025 00:57:29 +0000 Subject: refactor: Hard-code `char::is_control` According to https://www.unicode.org/policies/stability_policy.html#Property_Value, the set of codepoints in `Cc` will never change. So we can hard-code the patterns to match against instead of using a table. --- library/core/src/char/methods.rs | 6 +++++- library/core/src/unicode/mod.rs | 1 - library/core/src/unicode/unicode_data.rs | 25 ------------------------- src/tools/unicode-table-generator/src/main.rs | 1 - 4 files changed, 5 insertions(+), 28 deletions(-) (limited to 'src') diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 7ee0962721f..61ac7f8a339 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -950,7 +950,11 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_control(self) -> bool { - unicode::Cc(self) + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Cc` will never change. + // So we can just hard-code the patterns to match against instead of using a table. + matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } /// Returns `true` if this `char` has the `Grapheme_Extend` property. 
diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 49dbdeb1a6d..e1cb69c3c4f 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -9,7 +9,6 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; -pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::n::lookup as N; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9..55f64f1e96e 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -358,31 +358,6 @@ pub mod cased { } } -#[rustfmt::skip] -pub mod cc { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [ - ShortOffsetRunHeader::new(0, 1114272), - ]; - static OFFSETS: [u8; 5] = [ - 0, 32, 95, 33, 0, - ]; - pub fn lookup(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. 
- unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } - } -} - #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bd..38e5e8bbdb9 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -92,7 +92,6 @@ static PROPERTIES: &[&str] = &[ "Case_Ignorable", "Grapheme_Extend", "White_Space", - "Cc", "N", ]; -- cgit 1.4.1-3-g733a5 From 224fd13a9c583db4054fc241bd55f38f0b309dbf Mon Sep 17 00:00:00 2001 From: Zhongyao Chen Date: Mon, 18 Aug 2025 10:33:53 +0800 Subject: change riscv64a23 target reqirements comments: linux kernel 6.8.0, glibc 2.39 --- .../rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs | 2 +- src/doc/rustc/src/platform-support.md | 2 +- src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs b/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs index 27a0e27c969..394f8b0685d 100644 --- a/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs +++ b/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs @@ -6,7 +6,7 @@ pub(crate) fn target() -> Target { Target { llvm_target: "riscv64-unknown-linux-gnu".into(), metadata: TargetMetadata { - description: Some("RISC-V Linux (kernel 4.20, glibc 2.29)".into()), + description: Some("RISC-V Linux (kernel 6.8.0, glibc 2.39)".into()), tier: Some(3), host_tools: Some(true), std: Some(true), diff --git a/src/doc/rustc/src/platform-support.md b/src/doc/rustc/src/platform-support.md index cf55d4bd68f..891932a1b6a 100644 --- a/src/doc/rustc/src/platform-support.md +++ b/src/doc/rustc/src/platform-support.md @@ -389,7 +389,7 @@ target | std | host | notes 
[`riscv64gc-unknown-nuttx-elf`](platform-support/nuttx.md) | ✓ | | RISC-V 64bit with NuttX [`riscv64gc-unknown-openbsd`](platform-support/openbsd.md) | ✓ | ✓ | OpenBSD/riscv64 [`riscv64imac-unknown-nuttx-elf`](platform-support/nuttx.md) | ✓ | | RISC-V 64bit with NuttX -[`riscv64a23-unknown-linux-gnu`](platform-support/riscv64a23-unknown-linux-gnu.md) | ✓ | ✓ | RISC-V Linux (kernel 4.20+, glibc 2.29) +[`riscv64a23-unknown-linux-gnu`](platform-support/riscv64a23-unknown-linux-gnu.md) | ✓ | ✓ | RISC-V Linux (kernel 6.8.0+, glibc 2.39) [`s390x-unknown-linux-musl`](platform-support/s390x-unknown-linux-musl.md) | ✓ | | S390x Linux (kernel 3.2, musl 1.2.3) `sparc-unknown-linux-gnu` | ✓ | | 32-bit SPARC Linux [`sparc-unknown-none-elf`](./platform-support/sparc-unknown-none-elf.md) | * | | Bare 32-bit SPARC V7+ diff --git a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md index 6db4a672936..a13796e0cf8 100644 --- a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md +++ b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md @@ -12,10 +12,9 @@ This target will enable all mandary features of rva23u64 and rva23s64 by default ## Requirements -This target requires: +This target can be sucessfully build on the following platform: ubuntu 24.04 (Linux Kernel version 6.8.0, glibc 2.39). -* Linux Kernel version 4.20 or later -* glibc 2.17 or later +Other platforms may work, but are not tested. Please contanct if you encounter any issues. 
## Building the target -- cgit 1.4.1-3-g733a5 From ce07b5d3fc3d3bac072527ba29900f2238c11aba Mon Sep 17 00:00:00 2001 From: Zhongyao Chen Date: Tue, 19 Aug 2025 08:58:48 +0800 Subject: remove rva23s64 from riscv64a23-unknown-linux-gnu target --- compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs | 2 +- src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs b/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs index 394f8b0685d..60f2e7da042 100644 --- a/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs +++ b/compiler/rustc_target/src/spec/targets/riscv64a23_unknown_linux_gnu.rs @@ -17,7 +17,7 @@ pub(crate) fn target() -> Target { options: TargetOptions { code_model: Some(CodeModel::Medium), cpu: "generic-rv64".into(), - features: "+rva23u64,+rva23s64".into(), + features: "+rva23u64".into(), llvm_abiname: "lp64d".into(), max_atomic_width: Some(64), supported_split_debuginfo: Cow::Borrowed(&[SplitDebuginfo::Off]), diff --git a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md index a13796e0cf8..2cbaaa86654 100644 --- a/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md +++ b/src/doc/rustc/src/platform-support/riscv64a23-unknown-linux-gnu.md @@ -3,7 +3,7 @@ **Tier: 3** RISC-V target using the ratified [RVA23 Profile](https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc). -This target will enable all mandary features of rva23u64 and rva23s64 by default. +This target will enable all mandary features of rva23u64 by default. 
## Target maintainers -- cgit 1.4.1-3-g733a5 From 7d09ce84ee0b9b63e44ec90241ff9e6584068164 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:50:54 +0000 Subject: Bump slab from 0.4.10 to 0.4.11 in /tests/deps Bumps [slab](https://github.com/tokio-rs/slab) from 0.4.10 to 0.4.11. - [Release notes](https://github.com/tokio-rs/slab/releases) - [Changelog](https://github.com/tokio-rs/slab/blob/master/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/slab/compare/v0.4.10...v0.4.11) --- updated-dependencies: - dependency-name: slab dependency-version: 0.4.11 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- src/tools/miri/tests/deps/Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/tools/miri/tests/deps/Cargo.lock b/src/tools/miri/tests/deps/Cargo.lock index 4b783ebdc4e..65ca4215c60 100644 --- a/src/tools/miri/tests/deps/Cargo.lock +++ b/src/tools/miri/tests/deps/Cargo.lock @@ -296,9 +296,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "socket2" -- cgit 1.4.1-3-g733a5 From b51a8ed2439e21f2980160bdf6af043e42d55434 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Fri, 8 Aug 2025 16:31:53 -0500 Subject: test(rustfmt): Verify frontmatter is preserved This is to prove that the frontmatter is preserved. The choices in tests is intended for showing the different parts of the proposed Style Guide for frontmatters. 
--- src/tools/rustfmt/tests/source/frontmatter_compact.rs | 8 ++++++++ src/tools/rustfmt/tests/source/frontmatter_escaped.rs | 13 +++++++++++++ src/tools/rustfmt/tests/source/frontmatter_spaced.rs | 16 ++++++++++++++++ src/tools/rustfmt/tests/target/frontmatter_compact.rs | 8 ++++++++ src/tools/rustfmt/tests/target/frontmatter_escaped.rs | 13 +++++++++++++ src/tools/rustfmt/tests/target/frontmatter_spaced.rs | 16 ++++++++++++++++ 6 files changed, 74 insertions(+) create mode 100644 src/tools/rustfmt/tests/source/frontmatter_compact.rs create mode 100644 src/tools/rustfmt/tests/source/frontmatter_escaped.rs create mode 100644 src/tools/rustfmt/tests/source/frontmatter_spaced.rs create mode 100644 src/tools/rustfmt/tests/target/frontmatter_compact.rs create mode 100644 src/tools/rustfmt/tests/target/frontmatter_escaped.rs create mode 100644 src/tools/rustfmt/tests/target/frontmatter_spaced.rs (limited to 'src') diff --git a/src/tools/rustfmt/tests/source/frontmatter_compact.rs b/src/tools/rustfmt/tests/source/frontmatter_compact.rs new file mode 100644 index 00000000000..21d4c6f4b61 --- /dev/null +++ b/src/tools/rustfmt/tests/source/frontmatter_compact.rs @@ -0,0 +1,8 @@ +#!/usr/bin/env cargo +---identifier +[dependencies] +regex = "1" +--- +#![feature(frontmatter)] + +fn main() {} diff --git a/src/tools/rustfmt/tests/source/frontmatter_escaped.rs b/src/tools/rustfmt/tests/source/frontmatter_escaped.rs new file mode 100644 index 00000000000..0d026377566 --- /dev/null +++ b/src/tools/rustfmt/tests/source/frontmatter_escaped.rs @@ -0,0 +1,13 @@ +#!/usr/bin/env cargo +------------ +package.description = """ +Header +----- + +Body +""" +------------ + +#![feature(frontmatter)] + +fn main() {} diff --git a/src/tools/rustfmt/tests/source/frontmatter_spaced.rs b/src/tools/rustfmt/tests/source/frontmatter_spaced.rs new file mode 100644 index 00000000000..ee0bb81705c --- /dev/null +++ b/src/tools/rustfmt/tests/source/frontmatter_spaced.rs @@ -0,0 +1,16 @@ +#!/usr/bin/env 
cargo + + +--- identifier +[dependencies] +regex = "1" + +--- + + + + + +#![feature(frontmatter)] + +fn main() {} diff --git a/src/tools/rustfmt/tests/target/frontmatter_compact.rs b/src/tools/rustfmt/tests/target/frontmatter_compact.rs new file mode 100644 index 00000000000..21d4c6f4b61 --- /dev/null +++ b/src/tools/rustfmt/tests/target/frontmatter_compact.rs @@ -0,0 +1,8 @@ +#!/usr/bin/env cargo +---identifier +[dependencies] +regex = "1" +--- +#![feature(frontmatter)] + +fn main() {} diff --git a/src/tools/rustfmt/tests/target/frontmatter_escaped.rs b/src/tools/rustfmt/tests/target/frontmatter_escaped.rs new file mode 100644 index 00000000000..0d026377566 --- /dev/null +++ b/src/tools/rustfmt/tests/target/frontmatter_escaped.rs @@ -0,0 +1,13 @@ +#!/usr/bin/env cargo +------------ +package.description = """ +Header +----- + +Body +""" +------------ + +#![feature(frontmatter)] + +fn main() {} diff --git a/src/tools/rustfmt/tests/target/frontmatter_spaced.rs b/src/tools/rustfmt/tests/target/frontmatter_spaced.rs new file mode 100644 index 00000000000..ee0bb81705c --- /dev/null +++ b/src/tools/rustfmt/tests/target/frontmatter_spaced.rs @@ -0,0 +1,16 @@ +#!/usr/bin/env cargo + + +--- identifier +[dependencies] +regex = "1" + +--- + + + + + +#![feature(frontmatter)] + +fn main() {} -- cgit 1.4.1-3-g733a5 From 2914291e09cb13aab64207f9e11f2aaf74de3904 Mon Sep 17 00:00:00 2001 From: ltdk Date: Wed, 13 Aug 2025 01:16:42 -0400 Subject: Move WTF-8 code from std to core/alloc --- library/alloc/src/lib.rs | 3 + library/alloc/src/wtf8/mod.rs | 674 +++++------------------------------- library/alloc/src/wtf8/tests.rs | 209 +++++------ library/alloctests/lib.rs | 12 +- library/alloctests/tests/lib.rs | 1 + library/core/src/lib.rs | 2 + library/core/src/num/niche_types.rs | 15 + library/core/src/wtf8.rs | 642 +++++----------------------------- library/coretests/tests/lib.rs | 1 + library/coretests/tests/wtf8.rs | 1 + library/std/src/lib.rs | 1 + library/std/src/os/windows/ffi.rs | 
38 +- library/std/src/sys/os_str/wtf8.rs | 8 +- library/std/src/sys_common/mod.rs | 1 - src/tools/tidy/src/unit_tests.rs | 1 + 15 files changed, 331 insertions(+), 1278 deletions(-) create mode 100644 library/coretests/tests/wtf8.rs (limited to 'src') diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 639c5d4c930..711092ae8eb 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -159,6 +159,7 @@ #![feature(unicode_internals)] #![feature(unsize)] #![feature(unwrap_infallible)] +#![feature(wtf8_internals)] // tidy-alphabetical-end // // Language features: @@ -232,6 +233,8 @@ pub mod sync; #[cfg(all(not(no_global_oom_handling), not(no_rc), not(no_sync)))] pub mod task; pub mod vec; +#[cfg(all(not(no_rc), not(no_sync), not(no_global_oom_handling)))] +pub mod wtf8; #[doc(hidden)] #[unstable(feature = "liballoc_internals", issue = "none", reason = "implementation detail")] diff --git a/library/alloc/src/wtf8/mod.rs b/library/alloc/src/wtf8/mod.rs index 50bde88b5a4..95d317a5efb 100644 --- a/library/alloc/src/wtf8/mod.rs +++ b/library/alloc/src/wtf8/mod.rs @@ -1,135 +1,42 @@ -//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). -//! -//! This library uses Rust’s type system to maintain -//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), -//! like the `String` and `&str` types do for UTF-8. -//! -//! Since [WTF-8 must not be used -//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), -//! this library deliberately does not provide access to the underlying bytes -//! of WTF-8 strings, -//! nor can it decode WTF-8 from arbitrary bytes. -//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. - -// this module is imported from @SimonSapin's repo and has tons of dead code on -// unix (it's mostly used on windows), so don't worry about dead code here. -#![allow(dead_code)] +//! Heap-allocated counterpart to core `wtf8` module. 
+#![unstable( + feature = "wtf8_internals", + issue = "none", + reason = "this is internal code for representing OsStr on some platforms and not a public API" +)] +// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait +// implementations, so, we'll have to add more doc(hidden)s anyway +#![doc(hidden)] + +// Note: This module is also included in the alloctests crate using #[path] to +// run the tests. See the comment there for an explanation why this is the case. #[cfg(test)] mod tests; -use core::char::{MAX_LEN_UTF8, MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw}; -use core::clone::CloneToUninit; -use core::str::next_code_point; +use core::char::{MAX_LEN_UTF8, encode_utf8_raw}; +use core::hash::{Hash, Hasher}; +pub use core::wtf8::{CodePoint, Wtf8}; +#[cfg(not(test))] +pub use core::wtf8::{EncodeWide, Wtf8CodePoints}; +use core::{fmt, mem, ops, str}; -use crate::borrow::Cow; +use crate::borrow::{Cow, ToOwned}; +use crate::boxed::Box; use crate::collections::TryReserveError; -use crate::hash::{Hash, Hasher}; -use crate::iter::FusedIterator; +#[cfg(not(test))] use crate::rc::Rc; +use crate::string::String; +#[cfg(all(not(test), target_has_atomic = "ptr"))] use crate::sync::Arc; -use crate::sys_common::AsInner; -use crate::{fmt, mem, ops, slice, str}; - -const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; - -/// A Unicode code point: from U+0000 to U+10FFFF. -/// -/// Compares with the `char` type, -/// which represents a Unicode scalar value: -/// a code point that is not a surrogate (U+D800 to U+DFFF). -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { - value: u32, -} - -/// Format the code point as `U+` followed by four to six hexadecimal digits. 
-/// Example: `U+1F4A9` -impl fmt::Debug for CodePoint { - #[inline] - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(formatter, "U+{:04X}", self.value) - } -} - -impl CodePoint { - /// Unsafely creates a new `CodePoint` without checking the value. - /// - /// Only use when `value` is known to be less than or equal to 0x10FFFF. - #[inline] - pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { - CodePoint { value } - } - - /// Creates a new `CodePoint` if the value is a valid code point. - /// - /// Returns `None` if `value` is above 0x10FFFF. - #[inline] - pub fn from_u32(value: u32) -> Option { - match value { - 0..=0x10FFFF => Some(CodePoint { value }), - _ => None, - } - } - - /// Creates a new `CodePoint` from a `char`. - /// - /// Since all Unicode scalar values are code points, this always succeeds. - #[inline] - pub fn from_char(value: char) -> CodePoint { - CodePoint { value: value as u32 } - } - - /// Returns the numeric value of the code point. - #[inline] - pub fn to_u32(&self) -> u32 { - self.value - } - - /// Returns the numeric value of the code point if it is a leading surrogate. - #[inline] - pub fn to_lead_surrogate(&self) -> Option { - match self.value { - lead @ 0xD800..=0xDBFF => Some(lead as u16), - _ => None, - } - } - - /// Returns the numeric value of the code point if it is a trailing surrogate. - #[inline] - pub fn to_trail_surrogate(&self) -> Option { - match self.value { - trail @ 0xDC00..=0xDFFF => Some(trail as u16), - _ => None, - } - } - - /// Optionally returns a Unicode scalar value for the code point. - /// - /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). - #[inline] - pub fn to_char(&self) -> Option { - match self.value { - 0xD800..=0xDFFF => None, - _ => Some(unsafe { char::from_u32_unchecked(self.value) }), - } - } - - /// Returns a Unicode scalar value for the code point. 
- /// - /// Returns `'\u{FFFD}'` (the replacement character “�”) - /// if the code point is a surrogate (from U+D800 to U+DFFF). - #[inline] - pub fn to_char_lossy(&self) -> char { - self.to_char().unwrap_or('\u{FFFD}') - } -} +use crate::vec::Vec; /// An owned, growable string of well-formed WTF-8 data. /// /// Similar to `String`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +#[doc(hidden)] pub struct Wtf8Buf { bytes: Vec, @@ -181,6 +88,7 @@ impl fmt::Display for Wtf8Buf { } } +#[cfg_attr(test, allow(dead_code))] impl Wtf8Buf { /// Creates a new, empty WTF-8 string. #[inline] @@ -245,7 +153,9 @@ impl Wtf8Buf { string.is_known_utf8 = false; // Skip the WTF-8 concatenation check, // surrogate pairs are already decoded by decode_utf16 - string.push_code_point_unchecked(code_point); + unsafe { + string.push_code_point_unchecked(code_point); + } } } } @@ -255,9 +165,9 @@ impl Wtf8Buf { /// Appends the given `char` to the end of this string. /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. /// Copied from String::push. 
- fn push_code_point_unchecked(&mut self, code_point: CodePoint) { + unsafe fn push_code_point_unchecked(&mut self, code_point: CodePoint) { let mut bytes = [0; MAX_LEN_UTF8]; - let bytes = encode_utf8_raw(code_point.value, &mut bytes); + let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes); self.bytes.extend_from_slice(bytes) } @@ -379,7 +289,7 @@ impl Wtf8Buf { (Some(lead), Some(trail)) => { let len_without_lead_surrogate = self.len() - 3; self.bytes.truncate(len_without_lead_surrogate); - let other_without_trail_surrogate = &other.bytes[3..]; + let other_without_trail_surrogate = &other.as_bytes()[3..]; // 4 bytes for the supplementary code point self.bytes.reserve(4 + other_without_trail_surrogate.len()); self.push_char(decode_surrogate_pair(lead, trail)); @@ -392,7 +302,7 @@ impl Wtf8Buf { self.is_known_utf8 = false; } - self.bytes.extend_from_slice(&other.bytes); + self.bytes.extend_from_slice(other.as_bytes()); } } } @@ -400,7 +310,8 @@ impl Wtf8Buf { /// Append a Unicode scalar value at the end of the string. #[inline] pub fn push_char(&mut self, c: char) { - self.push_code_point_unchecked(CodePoint::from_char(c)) + // SAFETY: It's always safe to push a char. + unsafe { self.push_code_point_unchecked(CodePoint::from_char(c)) } } /// Append a code point at the end of the string. @@ -426,7 +337,7 @@ impl Wtf8Buf { } // No newly paired surrogates at the boundary. - self.push_code_point_unchecked(code_point) + unsafe { self.push_code_point_unchecked(code_point) } } /// Shortens a string to the specified length. @@ -437,7 +348,7 @@ impl Wtf8Buf { /// or if `new_len` is not a code point boundary. #[inline] pub fn truncate(&mut self, new_len: usize) { - assert!(is_code_point_boundary(self, new_len)); + assert!(self.is_code_point_boundary(new_len)); self.bytes.truncate(new_len) } @@ -474,8 +385,7 @@ impl Wtf8Buf { pos = surrogate_pos + 3; // Surrogates and the replacement character are all 3 bytes, so // they can substituted in-place. 
- self.bytes[surrogate_pos..pos] - .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + self.bytes[surrogate_pos..pos].copy_from_slice("\u{FFFD}".as_bytes()); } } unsafe { String::from_utf8_unchecked(self.bytes) } @@ -498,7 +408,7 @@ impl Wtf8Buf { /// More well behaving alternative to allowing outer types /// full mutable access to the core `Vec`. #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { + pub unsafe fn extend_from_slice_unchecked(&mut self, other: &[u8]) { self.bytes.extend_from_slice(other); self.is_known_utf8 = false; } @@ -541,153 +451,13 @@ impl Extend for Wtf8Buf { } } -/// A borrowed slice of well-formed WTF-8 data. -/// -/// Similar to `&str`, but can additionally contain surrogate code points -/// if they’re not in a surrogate pair. -#[derive(Eq, Ord, PartialEq, PartialOrd)] -#[repr(transparent)] -pub struct Wtf8 { - bytes: [u8], -} - -impl AsInner<[u8]> for Wtf8 { - #[inline] - fn as_inner(&self) -> &[u8] { - &self.bytes - } -} - -/// Formats the string in double quotes, with characters escaped according to -/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`, -/// where each `x` is a hexadecimal digit. -impl fmt::Debug for Wtf8 { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result { - use crate::fmt::Write; - for c in s.chars().flat_map(|c| c.escape_debug()) { - f.write_char(c)? 
- } - Ok(()) - } - - formatter.write_str("\"")?; - let mut pos = 0; - while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { - write_str_escaped(formatter, unsafe { - str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) - })?; - write!(formatter, "\\u{{{:x}}}", surrogate)?; - pos = surrogate_pos + 3; - } - write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; - formatter.write_str("\"") - } -} - -/// Formats the string with unpaired surrogates substituted with the replacement -/// character, U+FFFD. -impl fmt::Display for Wtf8 { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - let wtf8_bytes = &self.bytes; - let mut pos = 0; - loop { - match self.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - formatter.write_str(unsafe { - str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) - })?; - formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; - pos = surrogate_pos + 3; - } - None => { - let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; - if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) } - } - } - } - } -} - -impl Wtf8 { - /// Creates a WTF-8 slice from a UTF-8 `&str` slice. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. - #[inline] - pub fn from_str(value: &str) -> &Wtf8 { - unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } - } - - /// Creates a WTF-8 slice from a WTF-8 byte slice. - /// - /// Since the byte slice is not checked for valid WTF-8, this functions is - /// marked unsafe. - #[inline] - pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { - // SAFETY: start with &[u8], end with fancy &[u8] - unsafe { &*(value as *const [u8] as *const Wtf8) } - } - - /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. - /// - /// Since the byte slice is not checked for valid WTF-8, this functions is - /// marked unsafe. 
- #[inline] - unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { - // SAFETY: start with &mut [u8], end with fancy &mut [u8] - unsafe { &mut *(value as *mut [u8] as *mut Wtf8) } - } - - /// Returns the length, in WTF-8 bytes. - #[inline] - pub fn len(&self) -> usize { - self.bytes.len() - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.bytes.is_empty() - } - - /// Returns the code point at `position` if it is in the ASCII range, - /// or `b'\xFF'` otherwise. - /// - /// # Panics - /// - /// Panics if `position` is beyond the end of the string. - #[inline] - pub fn ascii_byte_at(&self, position: usize) -> u8 { - match self.bytes[position] { - ascii_byte @ 0x00..=0x7F => ascii_byte, - _ => 0xFF, - } - } - - /// Returns an iterator for the string’s code points. - #[inline] - pub fn code_points(&self) -> Wtf8CodePoints<'_> { - Wtf8CodePoints { bytes: self.bytes.iter() } - } - - /// Access raw bytes of WTF-8 data - #[inline] - pub fn as_bytes(&self) -> &[u8] { - &self.bytes - } - - /// Tries to convert the string to UTF-8 and return a `&str` slice. - /// - /// Returns `None` if the string contains surrogates. - /// - /// This does not copy the data. - #[inline] - pub fn as_str(&self) -> Result<&str, str::Utf8Error> { - str::from_utf8(&self.bytes) - } +// helps diff +mod wtf8 { + use super::*; /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. - pub fn to_owned(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false } + pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf { + Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false } } /// Lossily converts the string to UTF-8. @@ -696,20 +466,20 @@ impl Wtf8 { /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). /// /// This only copies the data if necessary (if it contains any surrogate). 
- pub fn to_string_lossy(&self) -> Cow<'_, str> { - let Some((surrogate_pos, _)) = self.next_surrogate(0) else { - return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }); + pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> { + let Some((surrogate_pos, _)) = slice.next_surrogate(0) else { + return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) }); }; - let wtf8_bytes = &self.bytes; - let mut utf8_bytes = Vec::with_capacity(self.len()); + let wtf8_bytes = slice.as_bytes(); + let mut utf8_bytes = Vec::with_capacity(slice.len()); utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); let mut pos = surrogate_pos + 3; loop { - match self.next_surrogate(pos) { + match slice.next_surrogate(pos) { Some((surrogate_pos, _)) => { utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); pos = surrogate_pos + 3; } None => { @@ -720,342 +490,76 @@ impl Wtf8 { } } - /// Converts the WTF-8 string to potentially ill-formed UTF-16 - /// and return an iterator of 16-bit code units. - /// - /// This is lossless: - /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units - /// would always return the original WTF-8 string. 
#[inline] - pub fn encode_wide(&self) -> EncodeWide<'_> { - EncodeWide { code_points: self.code_points(), extra: 0 } + pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) { + buf.is_known_utf8 = false; + slice.as_bytes().clone_into(&mut buf.bytes); } +} - #[inline] - fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { - let mut iter = self.bytes[pos..].iter(); - loop { - let b = *iter.next()?; - if b < 0x80 { - pos += 1; - } else if b < 0xE0 { - iter.next(); - pos += 2; - } else if b == 0xED { - match (iter.next(), iter.next()) { - (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { - return Some((pos, decode_surrogate(b2, b3))); - } - _ => pos += 3, - } - } else if b < 0xF0 { - iter.next(); - iter.next(); - pos += 3; - } else { - iter.next(); - iter.next(); - iter.next(); - pos += 4; - } - } - } +use self::wtf8::{to_owned, to_string_lossy, clone_into}; - #[inline] - fn final_lead_surrogate(&self) -> Option { - match self.bytes { - [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), - _ => None, - } +#[cfg(not(test))] +impl Wtf8 { + #[rustc_allow_incoherent_impl] + pub fn to_owned(&self) -> Wtf8Buf { + to_owned(self) } - #[inline] - fn initial_trail_surrogate(&self) -> Option { - match self.bytes { - [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), - _ => None, - } + #[rustc_allow_incoherent_impl] + pub fn clone_into(&self, buf: &mut Wtf8Buf) { + clone_into(self, buf) } - pub fn clone_into(&self, buf: &mut Wtf8Buf) { - buf.is_known_utf8 = false; - self.bytes.clone_into(&mut buf.bytes); + #[rustc_allow_incoherent_impl] + pub fn to_string_lossy(&self) -> Cow<'_, str> { + to_string_lossy(self) } - /// Boxes this `Wtf8`. - #[inline] + #[rustc_allow_incoherent_impl] pub fn into_box(&self) -> Box { - let boxed: Box<[u8]> = self.bytes.into(); + let boxed: Box<[u8]> = self.as_bytes().into(); unsafe { mem::transmute(boxed) } } - /// Creates a boxed, empty `Wtf8`. 
+ #[rustc_allow_incoherent_impl] pub fn empty_box() -> Box { let boxed: Box<[u8]> = Default::default(); unsafe { mem::transmute(boxed) } } - #[inline] + #[cfg(target_has_atomic = "ptr")] + #[rustc_allow_incoherent_impl] pub fn into_arc(&self) -> Arc { - let arc: Arc<[u8]> = Arc::from(&self.bytes); + let arc: Arc<[u8]> = Arc::from(self.as_bytes()); unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } } - #[inline] + #[rustc_allow_incoherent_impl] pub fn into_rc(&self) -> Rc { - let rc: Rc<[u8]> = Rc::from(&self.bytes); + let rc: Rc<[u8]> = Rc::from(self.as_bytes()); unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } } #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.bytes.make_ascii_lowercase() - } - - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.bytes.make_ascii_uppercase() - } - - #[inline] + #[rustc_allow_incoherent_impl] pub fn to_ascii_lowercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false } + Wtf8Buf { bytes: self.as_bytes().to_ascii_lowercase(), is_known_utf8: false } } #[inline] + #[rustc_allow_incoherent_impl] pub fn to_ascii_uppercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false } - } - - #[inline] - pub fn is_ascii(&self) -> bool { - self.bytes.is_ascii() - } - - #[inline] - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.bytes.eq_ignore_ascii_case(&other.bytes) - } -} - -/// Returns a slice of the given string for the byte range \[`begin`..`end`). -/// -/// # Panics -/// -/// Panics when `begin` and `end` do not point to code point boundaries, -/// or point beyond the end of the string. 
-impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::Range) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if range.start <= range.end - && is_code_point_boundary(self, range.start) - && is_code_point_boundary(self, range.end) - { - unsafe { slice_unchecked(self, range.start, range.end) } - } else { - slice_error_fail(self, range.start, range.end) - } - } -} - -/// Returns a slice of the given string from byte `begin` to its end. -/// -/// # Panics -/// -/// Panics when `begin` is not at a code point boundary, -/// or is beyond the end of the string. -impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::RangeFrom) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.start) { - unsafe { slice_unchecked(self, range.start, self.len()) } - } else { - slice_error_fail(self, range.start, self.len()) - } + Wtf8Buf { bytes: self.as_bytes().to_ascii_uppercase(), is_known_utf8: false } } } -/// Returns a slice of the given string from its beginning to byte `end`. -/// -/// # Panics -/// -/// Panics when `end` is not at a code point boundary, -/// or is beyond the end of the string. 
-impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::RangeTo) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.end) { - unsafe { slice_unchecked(self, 0, range.end) } - } else { - slice_error_fail(self, 0, range.end) - } - } -} - -impl ops::Index for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, _range: ops::RangeFull) -> &Wtf8 { - self - } -} - -#[inline] -fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { - // The first byte is assumed to be 0xED - 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F -} - #[inline] fn decode_surrogate_pair(lead: u16, trail: u16) -> char { let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); unsafe { char::from_u32_unchecked(code_point) } } -/// Copied from str::is_char_boundary -#[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { - if index == 0 { - return true; - } - match slice.bytes.get(index) { - None => index == slice.len(), - Some(&b) => (b as i8) >= -0x40, - } -} - -/// Verify that `index` is at the edge of either a valid UTF-8 codepoint -/// (i.e. a codepoint that's not a surrogate) or of the whole string. -/// -/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`. -/// Splitting between surrogates is valid as far as WTF-8 is concerned, but -/// we do not permit it in the public API because WTF-8 is considered an -/// implementation detail. 
-#[track_caller] -#[inline] -pub fn check_utf8_boundary(slice: &Wtf8, index: usize) { - if index == 0 { - return; - } - match slice.bytes.get(index) { - Some(0xED) => (), // Might be a surrogate - Some(&b) if (b as i8) >= -0x40 => return, - Some(_) => panic!("byte index {index} is not a codepoint boundary"), - None if index == slice.len() => return, - None => panic!("byte index {index} is out of bounds"), - } - if slice.bytes[index + 1] >= 0xA0 { - // There's a surrogate after index. Now check before index. - if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 { - panic!("byte index {index} lies between surrogate codepoints"); - } - } -} - -/// Copied from core::str::raw::slice_unchecked -#[inline] -pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { - // SAFETY: memory layout of a &[u8] and &Wtf8 are the same - unsafe { - let len = end - begin; - let start = s.as_bytes().as_ptr().add(begin); - Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len)) - } -} - -/// Copied from core::str::raw::slice_error_fail -#[inline(never)] -pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { - assert!(begin <= end); - panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); -} - -/// Iterator for the code points of a WTF-8 string. -/// -/// Created with the method `.code_points()`. -#[derive(Clone)] -pub struct Wtf8CodePoints<'a> { - bytes: slice::Iter<'a, u8>, -} - -impl Iterator for Wtf8CodePoints<'_> { - type Item = CodePoint; - - #[inline] - fn next(&mut self) -> Option { - // SAFETY: `self.bytes` has been created from a WTF-8 string - unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let len = self.bytes.len(); - (len.saturating_add(3) / 4, Some(len)) - } -} - -/// Generates a wide character sequence for potentially ill-formed UTF-16. 
-#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Clone)] -pub struct EncodeWide<'a> { - code_points: Wtf8CodePoints<'a>, - extra: u16, -} - -// Copied from libunicode/u_str.rs -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for EncodeWide<'_> { - type Item = u16; - - #[inline] - fn next(&mut self) -> Option { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); - } - - let mut buf = [0; MAX_LEN_UTF16]; - self.code_points.next().map(|code_point| { - let n = encode_utf16_raw(code_point.value, &mut buf).len(); - if n == 2 { - self.extra = buf[1]; - } - buf[0] - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.code_points.size_hint(); - let ext = (self.extra != 0) as usize; - // every code point gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. - (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext))) - } -} - -#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")] -impl FusedIterator for EncodeWide<'_> {} - -impl Hash for CodePoint { - #[inline] - fn hash(&self, state: &mut H) { - self.value.hash(state) - } -} - impl Hash for Wtf8Buf { #[inline] fn hash(&self, state: &mut H) { @@ -1063,21 +567,3 @@ impl Hash for Wtf8Buf { 0xfeu8.hash(state) } } - -impl Hash for Wtf8 { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) - } -} - -#[unstable(feature = "clone_to_uninit", issue = "126799")] -unsafe impl CloneToUninit for Wtf8 { - #[inline] - #[cfg_attr(debug_assertions, track_caller)] - unsafe fn clone_to_uninit(&self, dst: *mut u8) { - // SAFETY: we're just a transparent wrapper around [u8] - unsafe { self.bytes.clone_to_uninit(dst) } - } -} diff --git a/library/alloc/src/wtf8/tests.rs b/library/alloc/src/wtf8/tests.rs index b57c99a8452..291f63f9f9e 100644 --- a/library/alloc/src/wtf8/tests.rs +++ b/library/alloc/src/wtf8/tests.rs 
@@ -1,3 +1,5 @@ +use realalloc::string::ToString; + use super::*; #[test] @@ -82,82 +84,85 @@ fn code_point_to_char_lossy() { #[test] fn wtf8buf_new() { - assert_eq!(Wtf8Buf::new().bytes, b""); + assert_eq!(Wtf8Buf::new().as_bytes(), b""); } #[test] fn wtf8buf_from_str() { - assert_eq!(Wtf8Buf::from_str("").bytes, b""); - assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(Wtf8Buf::from_str("").as_bytes(), b""); + assert_eq!(Wtf8Buf::from_str("aé 💩").as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); } #[test] fn wtf8buf_from_string() { - assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b""); - assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(Wtf8Buf::from_string(String::from("")).as_bytes(), b""); + assert_eq!( + Wtf8Buf::from_string(String::from("aé 💩")).as_bytes(), + b"a\xC3\xA9 \xF0\x9F\x92\xA9" + ); } #[test] fn wtf8buf_from_wide() { let buf = Wtf8Buf::from_wide(&[]); - assert_eq!(buf.bytes, b""); + assert_eq!(buf.as_bytes(), b""); assert!(buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xDCA9]); - assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(buf.as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); assert!(buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); - assert_eq!(buf.bytes, b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); + assert_eq!(buf.as_bytes(), b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); assert!(!buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0xD800]); - assert_eq!(buf.bytes, b"\xED\xA0\x80"); + assert_eq!(buf.as_bytes(), b"\xED\xA0\x80"); assert!(!buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0xDBFF]); - assert_eq!(buf.bytes, b"\xED\xAF\xBF"); + assert_eq!(buf.as_bytes(), b"\xED\xAF\xBF"); assert!(!buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0xDC00]); - assert_eq!(buf.bytes, b"\xED\xB0\x80"); + assert_eq!(buf.as_bytes(), b"\xED\xB0\x80"); 
assert!(!buf.is_known_utf8); let buf = Wtf8Buf::from_wide(&[0xDFFF]); - assert_eq!(buf.bytes, b"\xED\xBF\xBF"); + assert_eq!(buf.as_bytes(), b"\xED\xBF\xBF"); assert!(!buf.is_known_utf8); } #[test] fn wtf8buf_push_str() { let mut string = Wtf8Buf::new(); - assert_eq!(string.bytes, b""); + assert_eq!(string.as_bytes(), b""); assert!(string.is_known_utf8); string.push_str("aé 💩"); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); assert!(string.is_known_utf8); } #[test] fn wtf8buf_push_char() { let mut string = Wtf8Buf::from_str("aé "); - assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 "); assert!(string.is_known_utf8); string.push_char('💩'); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); assert!(string.is_known_utf8); } #[test] fn wtf8buf_push() { let mut string = Wtf8Buf::from_str("aé "); - assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 "); assert!(string.is_known_utf8); string.push(CodePoint::from_char('💩')); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); assert!(string.is_known_utf8); fn c(value: u32) -> CodePoint { @@ -168,53 +173,53 @@ fn wtf8buf_push() { string.push(c(0xD83D)); // lead assert!(!string.is_known_utf8); string.push(c(0xDCA9)); // trail - assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!(string.as_bytes(), b"\xF0\x9F\x92\xA9"); // Magic! 
let mut string = Wtf8Buf::new(); string.push(c(0xD83D)); // lead assert!(!string.is_known_utf8); string.push(c(0x20)); // not surrogate string.push(c(0xDCA9)); // trail - assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + assert_eq!(string.as_bytes(), b"\xED\xA0\xBD \xED\xB2\xA9"); let mut string = Wtf8Buf::new(); string.push(c(0xD800)); // lead assert!(!string.is_known_utf8); string.push(c(0xDBFF)); // lead - assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80\xED\xAF\xBF"); let mut string = Wtf8Buf::new(); string.push(c(0xD800)); // lead assert!(!string.is_known_utf8); string.push(c(0xE000)); // not surrogate - assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80\xEE\x80\x80"); let mut string = Wtf8Buf::new(); string.push(c(0xD7FF)); // not surrogate assert!(string.is_known_utf8); string.push(c(0xDC00)); // trail assert!(!string.is_known_utf8); - assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\xED\x9F\xBF\xED\xB0\x80"); let mut string = Wtf8Buf::new(); string.push(c(0x61)); // not surrogate, < 3 bytes assert!(string.is_known_utf8); string.push(c(0xDC00)); // trail assert!(!string.is_known_utf8); - assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\x61\xED\xB0\x80"); let mut string = Wtf8Buf::new(); string.push(c(0xDC00)); // trail assert!(!string.is_known_utf8); - assert_eq!(string.bytes, b"\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\xED\xB0\x80"); } #[test] fn wtf8buf_push_wtf8() { let mut string = Wtf8Buf::from_str("aé"); - assert_eq!(string.bytes, b"a\xC3\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9"); string.push_wtf8(Wtf8::from_str(" 💩")); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); assert!(string.is_known_utf8); fn w(v: &[u8]) -> &Wtf8 { @@ -224,42 +229,42 @@ fn wtf8buf_push_wtf8() { let mut 
string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\xBD")); // lead string.push_wtf8(w(b"\xED\xB2\xA9")); // trail - assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!(string.as_bytes(), b"\xF0\x9F\x92\xA9"); // Magic! let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\xBD")); // lead string.push_wtf8(w(b" ")); // not surrogate string.push_wtf8(w(b"\xED\xB2\xA9")); // trail - assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + assert_eq!(string.as_bytes(), b"\xED\xA0\xBD \xED\xB2\xA9"); assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\x80")); // lead string.push_wtf8(w(b"\xED\xAF\xBF")); // lead - assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80\xED\xAF\xBF"); assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\x80")); // lead string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate - assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80\xEE\x80\x80"); assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate string.push_wtf8(w(b"\xED\xB0\x80")); // trail - assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\xED\x9F\xBF\xED\xB0\x80"); assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes string.push_wtf8(w(b"\xED\xB0\x80")); // trail - assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\x61\xED\xB0\x80"); assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xB0\x80")); // trail - assert_eq!(string.bytes, b"\xED\xB0\x80"); + assert_eq!(string.as_bytes(), b"\xED\xB0\x80"); assert!(!string.is_known_utf8); } @@ -269,15 +274,15 @@ fn wtf8buf_truncate() { assert!(string.is_known_utf8); string.truncate(3); - 
assert_eq!(string.bytes, b"a\xC3\xA9"); + assert_eq!(string.as_bytes(), b"a\xC3\xA9"); assert!(string.is_known_utf8); string.truncate(1); - assert_eq!(string.bytes, b"a"); + assert_eq!(string.as_bytes(), b"a"); assert!(string.is_known_utf8); string.truncate(0); - assert_eq!(string.bytes, b""); + assert_eq!(string.as_bytes(), b""); assert!(string.is_known_utf8); } @@ -287,11 +292,11 @@ fn wtf8buf_truncate_around_non_bmp() { assert!(string.is_known_utf8); string.truncate(4); - assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); + assert_eq!(string.as_bytes(), b"\xF0\x9F\x92\xA9"); assert!(string.is_known_utf8); string.truncate(0); - assert_eq!(string.bytes, b""); + assert_eq!(string.as_bytes(), b""); assert!(string.is_known_utf8); } @@ -361,7 +366,7 @@ fn wtf8buf_from_iterator() { Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } ); - assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!(f(&[0xD83D, 0xDCA9]).as_bytes(), b"\xF0\x9F\x92\xA9"); // Magic! assert_eq!( f(&[0xD83D, 0x20, 0xDCA9]), Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } @@ -401,7 +406,7 @@ fn wtf8buf_extend() { Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } ); - assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!(e(&[0xD83D], &[0xDCA9]).as_bytes(), b"\xF0\x9F\x92\xA9"); // Magic! 
assert_eq!( e(&[0xD83D, 0x20], &[0xDCA9]), Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } @@ -449,8 +454,8 @@ fn wtf8buf_show_str() { #[test] fn wtf8_from_str() { - assert_eq!(&Wtf8::from_str("").bytes, b""); - assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(&Wtf8::from_str("").as_bytes(), b""); + assert_eq!(&Wtf8::from_str("aé 💩").as_bytes(), b"a\xC3\xA9 \xF0\x9F\x92\xA9"); } #[test] @@ -461,7 +466,7 @@ fn wtf8_len() { #[test] fn wtf8_slice() { - assert_eq!(&Wtf8::from_str("aé 💩")[1..4].bytes, b"\xC3\xA9 "); + assert_eq!(&Wtf8::from_str("aé 💩")[1..4].as_bytes(), b"\xC3\xA9 "); } #[test] @@ -472,7 +477,7 @@ fn wtf8_slice_not_code_point_boundary() { #[test] fn wtf8_slice_from() { - assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!(&Wtf8::from_str("aé 💩")[1..].as_bytes(), b"\xC3\xA9 \xF0\x9F\x92\xA9"); } #[test] @@ -483,7 +488,7 @@ fn wtf8_slice_from_not_code_point_boundary() { #[test] fn wtf8_slice_to() { - assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 "); + assert_eq!(&Wtf8::from_str("aé 💩")[..4].as_bytes(), b"a\xC3\xA9 "); } #[test] @@ -529,12 +534,12 @@ fn wtf8_as_str() { #[test] fn wtf8_to_string_lossy() { - assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed("")); - assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩")); + assert_eq!(to_string_lossy(Wtf8::from_str("")), Cow::Borrowed("")); + assert_eq!(to_string_lossy(Wtf8::from_str("aé 💩")), Cow::Borrowed("aé 💩")); let mut string = Wtf8Buf::from_str("aé 💩"); string.push(CodePoint::from_u32(0xD800).unwrap()); let expected: Cow<'_, str> = Cow::Owned(String::from("aé 💩�")); - assert_eq!(string.to_string_lossy(), expected); + assert_eq!(to_string_lossy(&string), expected); } #[test] @@ -548,7 +553,7 @@ fn wtf8_display() { let mut string = Wtf8Buf::from_str("aé 💩"); string.push(CodePoint::from_u32(0xD800).unwrap()); - assert_eq!("aé 💩�", d(string.as_inner())); 
+ assert_eq!("aé 💩�", d(string.as_ref())); } #[test] @@ -577,67 +582,41 @@ fn wtf8_encode_wide_size_hint() { #[test] fn wtf8_clone_into() { let mut string = Wtf8Buf::new(); - Wtf8::from_str("green").clone_into(&mut string); - assert_eq!(string.bytes, b"green"); + clone_into(Wtf8::from_str("green"), &mut string); + assert_eq!(string.as_bytes(), b"green"); let mut string = Wtf8Buf::from_str("green"); - Wtf8::from_str("").clone_into(&mut string); - assert_eq!(string.bytes, b""); + clone_into(Wtf8::from_str(""), &mut string); + assert_eq!(string.as_bytes(), b""); let mut string = Wtf8Buf::from_str("red"); - Wtf8::from_str("green").clone_into(&mut string); - assert_eq!(string.bytes, b"green"); + clone_into(Wtf8::from_str("green"), &mut string); + assert_eq!(string.as_bytes(), b"green"); let mut string = Wtf8Buf::from_str("green"); - Wtf8::from_str("red").clone_into(&mut string); - assert_eq!(string.bytes, b"red"); + clone_into(Wtf8::from_str("red"), &mut string); + assert_eq!(string.as_bytes(), b"red"); let mut string = Wtf8Buf::from_str("green"); assert!(string.is_known_utf8); - unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").clone_into(&mut string) }; - assert_eq!(string.bytes, b"\xED\xA0\x80"); + clone_into(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }, &mut string); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80"); assert!(!string.is_known_utf8); } -#[test] -fn wtf8_to_ascii_lowercase() { - let lowercase = Wtf8::from_str("").to_ascii_lowercase(); - assert_eq!(lowercase.bytes, b""); - - let lowercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_lowercase(); - assert_eq!(lowercase.bytes, b"green grapes! 
\xf0\x9f\x8d\x87"); - - let lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_lowercase() }; - assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); - assert!(!lowercase.is_known_utf8); -} - -#[test] -fn wtf8_to_ascii_uppercase() { - let uppercase = Wtf8::from_str("").to_ascii_uppercase(); - assert_eq!(uppercase.bytes, b""); - - let uppercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_uppercase(); - assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); - - let uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_uppercase() }; - assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); - assert!(!uppercase.is_known_utf8); -} - #[test] fn wtf8_make_ascii_lowercase() { let mut lowercase = Wtf8Buf::from_str(""); lowercase.make_ascii_lowercase(); - assert_eq!(lowercase.bytes, b""); + assert_eq!(lowercase.as_bytes(), b""); let mut lowercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇"); lowercase.make_ascii_lowercase(); - assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); + assert_eq!(lowercase.as_bytes(), b"green grapes! \xf0\x9f\x8d\x87"); - let mut lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + let mut lowercase = to_owned(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }); lowercase.make_ascii_lowercase(); - assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); + assert_eq!(lowercase.as_bytes(), b"\xED\xA0\x80"); assert!(!lowercase.is_known_utf8); } @@ -645,22 +624,22 @@ fn wtf8_make_ascii_lowercase() { fn wtf8_make_ascii_uppercase() { let mut uppercase = Wtf8Buf::from_str(""); uppercase.make_ascii_uppercase(); - assert_eq!(uppercase.bytes, b""); + assert_eq!(uppercase.as_bytes(), b""); let mut uppercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇"); uppercase.make_ascii_uppercase(); - assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); + assert_eq!(uppercase.as_bytes(), b"GREEN GRAPES! 
\xf0\x9f\x8d\x87"); - let mut uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + let mut uppercase = to_owned(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }); uppercase.make_ascii_uppercase(); - assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); + assert_eq!(uppercase.as_bytes(), b"\xED\xA0\x80"); assert!(!uppercase.is_known_utf8); } #[test] fn wtf8_to_owned() { - let string = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; - assert_eq!(string.bytes, b"\xED\xA0\x80"); + let string = to_owned(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }); + assert_eq!(string.as_bytes(), b"\xED\xA0\x80"); assert!(!string.is_known_utf8); } @@ -669,44 +648,44 @@ fn wtf8_valid_utf8_boundaries() { let mut string = Wtf8Buf::from_str("aé 💩"); string.push(CodePoint::from_u32(0xD800).unwrap()); string.push(CodePoint::from_u32(0xD800).unwrap()); - check_utf8_boundary(&string, 0); - check_utf8_boundary(&string, 1); - check_utf8_boundary(&string, 3); - check_utf8_boundary(&string, 4); - check_utf8_boundary(&string, 8); - check_utf8_boundary(&string, 14); + string.check_utf8_boundary(0); + string.check_utf8_boundary(1); + string.check_utf8_boundary(3); + string.check_utf8_boundary(4); + string.check_utf8_boundary(8); + string.check_utf8_boundary(14); assert_eq!(string.len(), 14); string.push_char('a'); - check_utf8_boundary(&string, 14); - check_utf8_boundary(&string, 15); + string.check_utf8_boundary(14); + string.check_utf8_boundary(15); let mut string = Wtf8Buf::from_str("a"); string.push(CodePoint::from_u32(0xD800).unwrap()); - check_utf8_boundary(&string, 1); + string.check_utf8_boundary(1); let mut string = Wtf8Buf::from_str("\u{D7FF}"); string.push(CodePoint::from_u32(0xD800).unwrap()); - check_utf8_boundary(&string, 3); + string.check_utf8_boundary(3); let mut string = Wtf8Buf::new(); string.push(CodePoint::from_u32(0xD800).unwrap()); string.push_char('\u{D7FF}'); - check_utf8_boundary(&string, 3); + 
string.check_utf8_boundary(3); } #[test] #[should_panic(expected = "byte index 4 is out of bounds")] fn wtf8_utf8_boundary_out_of_bounds() { let string = Wtf8::from_str("aé"); - check_utf8_boundary(&string, 4); + string.check_utf8_boundary(4); } #[test] #[should_panic(expected = "byte index 1 is not a codepoint boundary")] fn wtf8_utf8_boundary_inside_codepoint() { let string = Wtf8::from_str("é"); - check_utf8_boundary(&string, 1); + string.check_utf8_boundary(1); } #[test] @@ -714,7 +693,7 @@ fn wtf8_utf8_boundary_inside_codepoint() { fn wtf8_utf8_boundary_inside_surrogate() { let mut string = Wtf8Buf::new(); string.push(CodePoint::from_u32(0xD800).unwrap()); - check_utf8_boundary(&string, 1); + string.check_utf8_boundary(1); } #[test] @@ -723,20 +702,22 @@ fn wtf8_utf8_boundary_between_surrogates() { let mut string = Wtf8Buf::new(); string.push(CodePoint::from_u32(0xD800).unwrap()); string.push(CodePoint::from_u32(0xD800).unwrap()); - check_utf8_boundary(&string, 3); + string.check_utf8_boundary(3); } #[test] fn wobbled_wtf8_plus_bytes_isnt_utf8() { - let mut string: Wtf8Buf = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + let mut string: Wtf8Buf = to_owned(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }); assert!(!string.is_known_utf8); - string.extend_from_slice(b"some utf-8"); + unsafe { + string.extend_from_slice_unchecked(b"some utf-8"); + } assert!(!string.is_known_utf8); } #[test] fn wobbled_wtf8_plus_str_isnt_utf8() { - let mut string: Wtf8Buf = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + let mut string: Wtf8Buf = to_owned(unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80") }); assert!(!string.is_known_utf8); string.push_str("some utf-8"); assert!(!string.is_known_utf8); diff --git a/library/alloctests/lib.rs b/library/alloctests/lib.rs index 3241b4b0045..97de7d6a701 100644 --- a/library/alloctests/lib.rs +++ b/library/alloctests/lib.rs @@ -18,6 +18,8 @@ #![feature(allocator_api)] 
#![feature(array_into_iter_constructors)] #![feature(assert_matches)] +#![feature(char_internals)] +#![feature(char_max_len)] #![feature(core_intrinsics)] #![feature(exact_size_is_empty)] #![feature(extend_one)] @@ -41,6 +43,7 @@ #![feature(trusted_random_access)] #![feature(try_reserve_kind)] #![feature(try_trait_v2)] +#![feature(wtf8_internals)] // tidy-alphabetical-end // // Language features: @@ -68,15 +71,18 @@ extern crate test; mod testing; use realalloc::*; -// We are directly including collections and raw_vec here as both use non-public -// methods and fields in tests and as such need to have the types to test in the -// same crate as the tests themself. +// We are directly including collections, raw_vec, and wtf8 here as they use non-public +// methods and fields in tests and as such need to have the types to test in the same +// crate as the tests themself. #[path = "../alloc/src/collections/mod.rs"] mod collections; #[path = "../alloc/src/raw_vec/mod.rs"] mod raw_vec; +#[path = "../alloc/src/wtf8/mod.rs"] +mod wtf8; + #[allow(dead_code)] // Not used in all configurations pub(crate) mod test_helpers { /// Copied from `std::test_helpers::test_rng`, since these tests rely on the diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs index fcfc7f8dd29..447af240a4b 100644 --- a/library/alloctests/tests/lib.rs +++ b/library/alloctests/tests/lib.rs @@ -3,6 +3,7 @@ #![feature(iter_array_chunks)] #![feature(assert_matches)] #![feature(btree_extract_if)] +#![feature(wtf8_internals)] #![feature(char_max_len)] #![feature(cow_is_borrowed)] #![feature(core_intrinsics)] diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 71abd707374..2b6ea7d9afc 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -362,6 +362,8 @@ pub mod slice; pub mod str; pub mod time; +pub mod wtf8; + pub mod unicode; /* Async */ diff --git a/library/core/src/num/niche_types.rs b/library/core/src/num/niche_types.rs index 
d57b1d433e5..610d9d8cf92 100644 --- a/library/core/src/num/niche_types.rs +++ b/library/core/src/num/niche_types.rs @@ -178,3 +178,18 @@ impl NotAllOnesHelper for u64 { impl NotAllOnesHelper for i64 { type Type = I64NotAllOnes; } + +define_valid_range_type! { + pub struct CodePointInner(u32 as u32 in 0..=0x10ffff); +} + +impl CodePointInner { + pub const ZERO: Self = CodePointInner::new(0).unwrap(); +} + +impl Default for CodePointInner { + #[inline] + fn default() -> Self { + Self::ZERO + } +} diff --git a/library/core/src/wtf8.rs b/library/core/src/wtf8.rs index 50bde88b5a4..5631993dea2 100644 --- a/library/core/src/wtf8.rs +++ b/library/core/src/wtf8.rs @@ -10,28 +10,23 @@ //! of WTF-8 strings, //! nor can it decode WTF-8 from arbitrary bytes. //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. - -// this module is imported from @SimonSapin's repo and has tons of dead code on -// unix (it's mostly used on windows), so don't worry about dead code here. -#![allow(dead_code)] - -#[cfg(test)] -mod tests; - -use core::char::{MAX_LEN_UTF8, MAX_LEN_UTF16, encode_utf8_raw, encode_utf16_raw}; -use core::clone::CloneToUninit; -use core::str::next_code_point; - -use crate::borrow::Cow; -use crate::collections::TryReserveError; +#![unstable( + feature = "wtf8_internals", + issue = "none", + reason = "this is internal code for representing OsStr on some platforms and not a public API" +)] +// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait +// implementations, so, we'll have to add more doc(hidden)s anyway +#![doc(hidden)] + +use crate::char::{MAX_LEN_UTF16, encode_utf16_raw}; +use crate::clone::CloneToUninit; +use crate::fmt::{self, Write}; use crate::hash::{Hash, Hasher}; use crate::iter::FusedIterator; -use crate::rc::Rc; -use crate::sync::Arc; -use crate::sys_common::AsInner; -use crate::{fmt, mem, ops, slice, str}; - -const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; +use 
crate::num::niche_types::CodePointInner; +use crate::str::next_code_point; +use crate::{ops, slice, str}; /// A Unicode code point: from U+0000 to U+10FFFF. /// @@ -39,16 +34,15 @@ const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; /// which represents a Unicode scalar value: /// a code point that is not a surrogate (U+D800 to U+DFFF). #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { - value: u32, -} +#[doc(hidden)] +pub struct CodePoint(CodePointInner); /// Format the code point as `U+` followed by four to six hexadecimal digits. /// Example: `U+1F4A9` impl fmt::Debug for CodePoint { #[inline] fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(formatter, "U+{:04X}", self.value) + write!(formatter, "U+{:04X}", self.0.as_inner()) } } @@ -58,7 +52,8 @@ impl CodePoint { /// Only use when `value` is known to be less than or equal to 0x10FFFF. #[inline] pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { - CodePoint { value } + // SAFETY: Guaranteed by caller. + CodePoint(unsafe { CodePointInner::new_unchecked(value) }) } /// Creates a new `CodePoint` if the value is a valid code point. @@ -66,10 +61,7 @@ impl CodePoint { /// Returns `None` if `value` is above 0x10FFFF. #[inline] pub fn from_u32(value: u32) -> Option { - match value { - 0..=0x10FFFF => Some(CodePoint { value }), - _ => None, - } + Some(CodePoint(CodePointInner::new(value)?)) } /// Creates a new `CodePoint` from a `char`. @@ -77,19 +69,20 @@ impl CodePoint { /// Since all Unicode scalar values are code points, this always succeeds. #[inline] pub fn from_char(value: char) -> CodePoint { - CodePoint { value: value as u32 } + // SAFETY: All char are valid for this type. + unsafe { CodePoint::from_u32_unchecked(value as u32) } } /// Returns the numeric value of the code point. #[inline] pub fn to_u32(&self) -> u32 { - self.value + self.0.as_inner() } /// Returns the numeric value of the code point if it is a leading surrogate. 
#[inline] pub fn to_lead_surrogate(&self) -> Option { - match self.value { + match self.to_u32() { lead @ 0xD800..=0xDBFF => Some(lead as u16), _ => None, } @@ -98,7 +91,7 @@ impl CodePoint { /// Returns the numeric value of the code point if it is a trailing surrogate. #[inline] pub fn to_trail_surrogate(&self) -> Option { - match self.value { + match self.to_u32() { trail @ 0xDC00..=0xDFFF => Some(trail as u16), _ => None, } @@ -109,9 +102,10 @@ impl CodePoint { /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). #[inline] pub fn to_char(&self) -> Option { - match self.value { + match self.to_u32() { 0xD800..=0xDFFF => None, - _ => Some(unsafe { char::from_u32_unchecked(self.value) }), + // SAFETY: We explicitly check that the char is valid. + valid => Some(unsafe { char::from_u32_unchecked(valid) }), } } @@ -121,423 +115,7 @@ impl CodePoint { /// if the code point is a surrogate (from U+D800 to U+DFFF). #[inline] pub fn to_char_lossy(&self) -> char { - self.to_char().unwrap_or('\u{FFFD}') - } -} - -/// An owned, growable string of well-formed WTF-8 data. -/// -/// Similar to `String`, but can additionally contain surrogate code points -/// if they’re not in a surrogate pair. -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] -pub struct Wtf8Buf { - bytes: Vec, - - /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily - /// know this if we're constructed from a `String` or `&str`. - /// - /// It is possible for `bytes` to have valid UTF-8 without this being - /// set, such as when we're concatenating `&Wtf8`'s and surrogates become - /// paired, as we don't bother to rescan the entire string. 
- is_known_utf8: bool, -} - -impl ops::Deref for Wtf8Buf { - type Target = Wtf8; - - fn deref(&self) -> &Wtf8 { - self.as_slice() - } -} - -impl ops::DerefMut for Wtf8Buf { - fn deref_mut(&mut self) -> &mut Wtf8 { - self.as_mut_slice() - } -} - -/// Formats the string in double quotes, with characters escaped according to -/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`, -/// where each `x` is a hexadecimal digit. -/// -/// For example, the code units [U+0061, U+D800, U+000A] are formatted as -/// `"a\u{D800}\n"`. -impl fmt::Debug for Wtf8Buf { - #[inline] - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&**self, formatter) - } -} - -/// Formats the string with unpaired surrogates substituted with the replacement -/// character, U+FFFD. -impl fmt::Display for Wtf8Buf { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(s) = self.as_known_utf8() { - fmt::Display::fmt(s, formatter) - } else { - fmt::Display::fmt(&**self, formatter) - } - } -} - -impl Wtf8Buf { - /// Creates a new, empty WTF-8 string. - #[inline] - pub fn new() -> Wtf8Buf { - Wtf8Buf { bytes: Vec::new(), is_known_utf8: true } - } - - /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. - #[inline] - pub fn with_capacity(capacity: usize) -> Wtf8Buf { - Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true } - } - - /// Creates a WTF-8 string from a WTF-8 byte vec. - /// - /// Since the byte vec is not checked for valid WTF-8, this function is - /// marked unsafe. - #[inline] - pub unsafe fn from_bytes_unchecked(value: Vec) -> Wtf8Buf { - Wtf8Buf { bytes: value, is_known_utf8: false } - } - - /// Creates a WTF-8 string from a UTF-8 `String`. - /// - /// This takes ownership of the `String` and does not copy. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. 
- #[inline] - pub const fn from_string(string: String) -> Wtf8Buf { - Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true } - } - - /// Creates a WTF-8 string from a UTF-8 `&str` slice. - /// - /// This copies the content of the slice. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. - #[inline] - pub fn from_str(s: &str) -> Wtf8Buf { - Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: true } - } - - pub fn clear(&mut self) { - self.bytes.clear(); - self.is_known_utf8 = true; - } - - /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. - /// - /// This is lossless: calling `.encode_wide()` on the resulting string - /// will always return the original code units. - pub fn from_wide(v: &[u16]) -> Wtf8Buf { - let mut string = Wtf8Buf::with_capacity(v.len()); - for item in char::decode_utf16(v.iter().cloned()) { - match item { - Ok(ch) => string.push_char(ch), - Err(surrogate) => { - let surrogate = surrogate.unpaired_surrogate(); - // Surrogates are known to be in the code point range. - let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; - // The string will now contain an unpaired surrogate. - string.is_known_utf8 = false; - // Skip the WTF-8 concatenation check, - // surrogate pairs are already decoded by decode_utf16 - string.push_code_point_unchecked(code_point); - } - } - } - string - } - - /// Appends the given `char` to the end of this string. - /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. - /// Copied from String::push. 
- fn push_code_point_unchecked(&mut self, code_point: CodePoint) { - let mut bytes = [0; MAX_LEN_UTF8]; - let bytes = encode_utf8_raw(code_point.value, &mut bytes); - self.bytes.extend_from_slice(bytes) - } - - #[inline] - pub fn as_slice(&self) -> &Wtf8 { - unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } - } - - #[inline] - pub fn as_mut_slice(&mut self) -> &mut Wtf8 { - // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would - // cause them to change from well-formed UTF-8 to ill-formed UTF-8, - // which would break the assumptions of the `is_known_utf8` field. - unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } - } - - /// Converts the string to UTF-8 without validation, if it was created from - /// valid UTF-8. - #[inline] - fn as_known_utf8(&self) -> Option<&str> { - if self.is_known_utf8 { - // SAFETY: The buffer is known to be valid UTF-8. - Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) }) - } else { - None - } - } - - /// Reserves capacity for at least `additional` more bytes to be inserted - /// in the given `Wtf8Buf`. - /// The collection may reserve more space to avoid frequent reallocations. - /// - /// # Panics - /// - /// Panics if the new capacity exceeds `isize::MAX` bytes. - #[inline] - pub fn reserve(&mut self, additional: usize) { - self.bytes.reserve(additional) - } - - /// Tries to reserve capacity for at least `additional` more bytes to be - /// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to - /// avoid frequent reallocations. After calling `try_reserve`, capacity will - /// be greater than or equal to `self.len() + additional`. Does nothing if - /// capacity is already sufficient. This method preserves the contents even - /// if an error occurs. - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. 
- #[inline] - pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.bytes.try_reserve(additional) - } - - #[inline] - pub fn reserve_exact(&mut self, additional: usize) { - self.bytes.reserve_exact(additional) - } - - /// Tries to reserve the minimum capacity for exactly `additional` more - /// bytes to be inserted in the given `Wtf8Buf`. After calling - /// `try_reserve_exact`, capacity will be greater than or equal to - /// `self.len() + additional` if it returns `Ok(())`. - /// Does nothing if the capacity is already sufficient. - /// - /// Note that the allocator may give the `Wtf8Buf` more space than it - /// requests. Therefore, capacity can not be relied upon to be precisely - /// minimal. Prefer [`try_reserve`] if future insertions are expected. - /// - /// [`try_reserve`]: Wtf8Buf::try_reserve - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. - #[inline] - pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.bytes.try_reserve_exact(additional) - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - self.bytes.shrink_to_fit() - } - - #[inline] - pub fn shrink_to(&mut self, min_capacity: usize) { - self.bytes.shrink_to(min_capacity) - } - - #[inline] - pub fn leak<'a>(self) -> &'a mut Wtf8 { - unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) } - } - - /// Returns the number of bytes that this string buffer can hold without reallocating. - #[inline] - pub fn capacity(&self) -> usize { - self.bytes.capacity() - } - - /// Append a UTF-8 slice at the end of the string. - #[inline] - pub fn push_str(&mut self, other: &str) { - self.bytes.extend_from_slice(other.as_bytes()) - } - - /// Append a WTF-8 slice at the end of the string. 
- /// - /// This replaces newly paired surrogates at the boundary - /// with a supplementary code point, - /// like concatenating ill-formed UTF-16 strings effectively would. - #[inline] - pub fn push_wtf8(&mut self, other: &Wtf8) { - match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { - // Replace newly paired surrogates by a supplementary code point. - (Some(lead), Some(trail)) => { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - let other_without_trail_surrogate = &other.bytes[3..]; - // 4 bytes for the supplementary code point - self.bytes.reserve(4 + other_without_trail_surrogate.len()); - self.push_char(decode_surrogate_pair(lead, trail)); - self.bytes.extend_from_slice(other_without_trail_surrogate); - } - _ => { - // If we'll be pushing a string containing a surrogate, we may - // no longer have UTF-8. - if self.is_known_utf8 && other.next_surrogate(0).is_some() { - self.is_known_utf8 = false; - } - - self.bytes.extend_from_slice(&other.bytes); - } - } - } - - /// Append a Unicode scalar value at the end of the string. - #[inline] - pub fn push_char(&mut self, c: char) { - self.push_code_point_unchecked(CodePoint::from_char(c)) - } - - /// Append a code point at the end of the string. - /// - /// This replaces newly paired surrogates at the boundary - /// with a supplementary code point, - /// like concatenating ill-formed UTF-16 strings effectively would. - #[inline] - pub fn push(&mut self, code_point: CodePoint) { - if let Some(trail) = code_point.to_trail_surrogate() { - if let Some(lead) = (&*self).final_lead_surrogate() { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - self.push_char(decode_surrogate_pair(lead, trail)); - return; - } - - // We're pushing a trailing surrogate. - self.is_known_utf8 = false; - } else if code_point.to_lead_surrogate().is_some() { - // We're pushing a leading surrogate. 
- self.is_known_utf8 = false; - } - - // No newly paired surrogates at the boundary. - self.push_code_point_unchecked(code_point) - } - - /// Shortens a string to the specified length. - /// - /// # Panics - /// - /// Panics if `new_len` > current length, - /// or if `new_len` is not a code point boundary. - #[inline] - pub fn truncate(&mut self, new_len: usize) { - assert!(is_code_point_boundary(self, new_len)); - self.bytes.truncate(new_len) - } - - /// Consumes the WTF-8 string and tries to convert it to a vec of bytes. - #[inline] - pub fn into_bytes(self) -> Vec { - self.bytes - } - - /// Consumes the WTF-8 string and tries to convert it to UTF-8. - /// - /// This does not copy the data. - /// - /// If the contents are not well-formed UTF-8 - /// (that is, if the string contains surrogates), - /// the original WTF-8 string is returned instead. - pub fn into_string(self) -> Result { - if self.is_known_utf8 || self.next_surrogate(0).is_none() { - Ok(unsafe { String::from_utf8_unchecked(self.bytes) }) - } else { - Err(self) - } - } - - /// Consumes the WTF-8 string and converts it lossily to UTF-8. - /// - /// This does not copy the data (but may overwrite parts of it in place). - /// - /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) - pub fn into_string_lossy(mut self) -> String { - if !self.is_known_utf8 { - let mut pos = 0; - while let Some((surrogate_pos, _)) = self.next_surrogate(pos) { - pos = surrogate_pos + 3; - // Surrogates and the replacement character are all 3 bytes, so - // they can substituted in-place. - self.bytes[surrogate_pos..pos] - .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - } - } - unsafe { String::from_utf8_unchecked(self.bytes) } - } - - /// Converts this `Wtf8Buf` into a boxed `Wtf8`. - #[inline] - pub fn into_box(self) -> Box { - // SAFETY: relies on `Wtf8` being `repr(transparent)`. - unsafe { mem::transmute(self.bytes.into_boxed_slice()) } - } - - /// Converts a `Box` into a `Wtf8Buf`. 
- pub fn from_box(boxed: Box) -> Wtf8Buf { - let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; - Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false } - } - - /// Provides plumbing to core `Vec::extend_from_slice`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { - self.bytes.extend_from_slice(other); - self.is_known_utf8 = false; - } -} - -/// Creates a new WTF-8 string from an iterator of code points. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. -impl FromIterator for Wtf8Buf { - fn from_iter>(iter: T) -> Wtf8Buf { - let mut string = Wtf8Buf::new(); - string.extend(iter); - string - } -} - -/// Append code points from an iterator to the string. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. -impl Extend for Wtf8Buf { - fn extend>(&mut self, iter: T) { - let iterator = iter.into_iter(); - let (low, _high) = iterator.size_hint(); - // Lower bound of one byte per code point (ASCII only) - self.bytes.reserve(low); - iterator.for_each(move |code_point| self.push(code_point)); - } - - #[inline] - fn extend_one(&mut self, code_point: CodePoint) { - self.push(code_point); - } - - #[inline] - fn extend_reserve(&mut self, additional: usize) { - // Lower bound of one byte per code point (ASCII only) - self.bytes.reserve(additional); + self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER) } } @@ -547,13 +125,15 @@ impl Extend for Wtf8Buf { /// if they’re not in a surrogate pair. 
#[derive(Eq, Ord, PartialEq, PartialOrd)] #[repr(transparent)] +#[rustc_has_incoherent_inherent_impls] +#[doc(hidden)] pub struct Wtf8 { bytes: [u8], } -impl AsInner<[u8]> for Wtf8 { +impl AsRef<[u8]> for Wtf8 { #[inline] - fn as_inner(&self) -> &[u8] { + fn as_ref(&self) -> &[u8] { &self.bytes } } @@ -574,12 +154,15 @@ impl fmt::Debug for Wtf8 { formatter.write_str("\"")?; let mut pos = 0; while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { + // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes. write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) })?; write!(formatter, "\\u{{{:x}}}", surrogate)?; pos = surrogate_pos + 3; } + + // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8. write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; formatter.write_str("\"") } @@ -594,13 +177,15 @@ impl fmt::Display for Wtf8 { loop { match self.next_surrogate(pos) { Some((surrogate_pos, _)) => { + // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes. formatter.write_str(unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) })?; - formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; + formatter.write_char(char::REPLACEMENT_CHARACTER)?; pos = surrogate_pos + 3; } None => { + // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8. let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) } } @@ -611,10 +196,9 @@ impl fmt::Display for Wtf8 { impl Wtf8 { /// Creates a WTF-8 slice from a UTF-8 `&str` slice. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. #[inline] pub fn from_str(value: &str) -> &Wtf8 { + // SAFETY: Since WTF-8 is a superset of UTF-8, this always is valid. 
unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } } @@ -633,7 +217,7 @@ impl Wtf8 { /// Since the byte slice is not checked for valid WTF-8, this functions is /// marked unsafe. #[inline] - unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { + pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { // SAFETY: start with &mut [u8], end with fancy &mut [u8] unsafe { &mut *(value as *mut [u8] as *mut Wtf8) } } @@ -685,41 +269,6 @@ impl Wtf8 { str::from_utf8(&self.bytes) } - /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. - pub fn to_owned(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false } - } - - /// Lossily converts the string to UTF-8. - /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. - /// - /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). - /// - /// This only copies the data if necessary (if it contains any surrogate). - pub fn to_string_lossy(&self) -> Cow<'_, str> { - let Some((surrogate_pos, _)) = self.next_surrogate(0) else { - return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }); - }; - let wtf8_bytes = &self.bytes; - let mut utf8_bytes = Vec::with_capacity(self.len()); - utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - let mut pos = surrogate_pos + 3; - loop { - match self.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - pos = surrogate_pos + 3; - } - None => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); - return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); - } - } - } - } - /// Converts the WTF-8 string to potentially ill-formed UTF-16 /// and return an iterator of 16-bit code units. 
/// @@ -732,7 +281,7 @@ impl Wtf8 { } #[inline] - fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { + pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { let mut iter = self.bytes[pos..].iter(); loop { let b = *iter.next()?; @@ -762,7 +311,7 @@ impl Wtf8 { } #[inline] - fn final_lead_surrogate(&self) -> Option { + pub fn final_lead_surrogate(&self) -> Option { match self.bytes { [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), _ => None, @@ -770,43 +319,13 @@ impl Wtf8 { } #[inline] - fn initial_trail_surrogate(&self) -> Option { + pub fn initial_trail_surrogate(&self) -> Option { match self.bytes { [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), _ => None, } } - pub fn clone_into(&self, buf: &mut Wtf8Buf) { - buf.is_known_utf8 = false; - self.bytes.clone_into(&mut buf.bytes); - } - - /// Boxes this `Wtf8`. - #[inline] - pub fn into_box(&self) -> Box { - let boxed: Box<[u8]> = self.bytes.into(); - unsafe { mem::transmute(boxed) } - } - - /// Creates a boxed, empty `Wtf8`. 
- pub fn empty_box() -> Box { - let boxed: Box<[u8]> = Default::default(); - unsafe { mem::transmute(boxed) } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - let arc: Arc<[u8]> = Arc::from(&self.bytes); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } - } - - #[inline] - pub fn into_rc(&self) -> Rc { - let rc: Rc<[u8]> = Rc::from(&self.bytes); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } - } - #[inline] pub fn make_ascii_lowercase(&mut self) { self.bytes.make_ascii_lowercase() @@ -818,23 +337,23 @@ impl Wtf8 { } #[inline] - pub fn to_ascii_lowercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false } + pub fn is_ascii(&self) -> bool { + self.bytes.is_ascii() } #[inline] - pub fn to_ascii_uppercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false } + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + self.bytes.eq_ignore_ascii_case(&other.bytes) } #[inline] - pub fn is_ascii(&self) -> bool { - self.bytes.is_ascii() + pub fn is_code_point_boundary(&self, index: usize) -> bool { + is_code_point_boundary(self, index) } #[inline] - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.bytes.eq_ignore_ascii_case(&other.bytes) + pub fn check_utf8_boundary(&self, index: usize) { + check_utf8_boundary(self, index) } } @@ -849,11 +368,11 @@ impl ops::Index> for Wtf8 { #[inline] fn index(&self, range: ops::Range) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] if range.start <= range.end - && is_code_point_boundary(self, range.start) - && is_code_point_boundary(self, range.end) + && self.is_code_point_boundary(range.start) + && self.is_code_point_boundary(range.end) { + // SAFETY: is_code_point_boundary checks that the index is valid unsafe { slice_unchecked(self, range.start, range.end) } } else { slice_error_fail(self, range.start, range.end) @@ -872,8 +391,8 @@ impl ops::Index> for Wtf8 { #[inline] fn 
index(&self, range: ops::RangeFrom) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.start) { + if self.is_code_point_boundary(range.start) { + // SAFETY: is_code_point_boundary checks that the index is valid unsafe { slice_unchecked(self, range.start, self.len()) } } else { slice_error_fail(self, range.start, self.len()) @@ -892,8 +411,8 @@ impl ops::Index> for Wtf8 { #[inline] fn index(&self, range: ops::RangeTo) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.end) { + if self.is_code_point_boundary(range.end) { + // SAFETY: is_code_point_boundary checks that the index is valid unsafe { slice_unchecked(self, 0, range.end) } } else { slice_error_fail(self, 0, range.end) @@ -916,11 +435,7 @@ fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F } -#[inline] -fn decode_surrogate_pair(lead: u16, trail: u16) -> char { - let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); - unsafe { char::from_u32_unchecked(code_point) } -} +// helps diff to be unindented /// Copied from str::is_char_boundary #[inline] @@ -964,7 +479,7 @@ pub fn check_utf8_boundary(slice: &Wtf8, index: usize) { /// Copied from core::str::raw::slice_unchecked #[inline] -pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { +unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { // SAFETY: memory layout of a &[u8] and &Wtf8 are the same unsafe { let len = end - begin; @@ -975,7 +490,7 @@ pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { /// Copied from core::str::raw::slice_error_fail #[inline(never)] -pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { +fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! 
{ assert!(begin <= end); panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); } @@ -984,6 +499,7 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { /// /// Created with the method `.code_points()`. #[derive(Clone)] +#[doc(hidden)] pub struct Wtf8CodePoints<'a> { bytes: slice::Iter<'a, u8>, } @@ -994,7 +510,7 @@ impl Iterator for Wtf8CodePoints<'_> { #[inline] fn next(&mut self) -> Option { // SAFETY: `self.bytes` has been created from a WTF-8 string - unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } + unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint::from_u32_unchecked(c)) } } #[inline] @@ -1004,9 +520,19 @@ impl Iterator for Wtf8CodePoints<'_> { } } +impl fmt::Debug for Wtf8CodePoints<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Wtf8CodePoints") + // SAFETY: We always leave the string in a valid state after each iteration. + .field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) }) + .finish() + } +} + /// Generates a wide character sequence for potentially ill-formed UTF-16. 
#[stable(feature = "rust1", since = "1.0.0")] #[derive(Clone)] +#[doc(hidden)] pub struct EncodeWide<'a> { code_points: Wtf8CodePoints<'a>, extra: u16, @@ -1027,7 +553,7 @@ impl Iterator for EncodeWide<'_> { let mut buf = [0; MAX_LEN_UTF16]; self.code_points.next().map(|code_point| { - let n = encode_utf16_raw(code_point.value, &mut buf).len(); + let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len(); if n == 2 { self.extra = buf[1]; } @@ -1046,21 +572,19 @@ impl Iterator for EncodeWide<'_> { } } +impl fmt::Debug for EncodeWide<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("EncodeWide").finish_non_exhaustive() + } +} + #[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")] impl FusedIterator for EncodeWide<'_> {} impl Hash for CodePoint { #[inline] fn hash(&self, state: &mut H) { - self.value.hash(state) - } -} - -impl Hash for Wtf8Buf { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) + self.0.hash(state) } } diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index d2281b1df2f..4415b614817 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -194,6 +194,7 @@ mod time; mod tuple; mod unicode; mod waker; +mod wtf8; /// Copied from `std::test_helpers::test_rng`, see that function for rationale. 
#[track_caller] diff --git a/library/coretests/tests/wtf8.rs b/library/coretests/tests/wtf8.rs new file mode 100644 index 00000000000..9f187e70630 --- /dev/null +++ b/library/coretests/tests/wtf8.rs @@ -0,0 +1 @@ +// All `wtf8` tests live in library/alloctests/tests/wtf8.rs diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index ab417b6c72f..ca76dcc5147 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -391,6 +391,7 @@ #![feature(try_with_capacity)] #![feature(unique_rc_arc)] #![feature(vec_into_raw_parts)] +#![feature(wtf8_internals)] // tidy-alphabetical-end // // Library features (unwind): diff --git a/library/std/src/os/windows/ffi.rs b/library/std/src/os/windows/ffi.rs index 496443dbbc3..345d5b74285 100644 --- a/library/std/src/os/windows/ffi.rs +++ b/library/std/src/os/windows/ffi.rs @@ -53,12 +53,13 @@ #![stable(feature = "rust1", since = "1.0.0")] +use alloc::wtf8::Wtf8Buf; + use crate::ffi::{OsStr, OsString}; +use crate::fmt; +use crate::iter::FusedIterator; use crate::sealed::Sealed; use crate::sys::os_str::Buf; -#[stable(feature = "rust1", since = "1.0.0")] -pub use crate::sys_common::wtf8::EncodeWide; -use crate::sys_common::wtf8::Wtf8Buf; use crate::sys_common::{AsInner, FromInner}; /// Windows-specific extensions to [`OsString`]. @@ -130,6 +131,35 @@ pub trait OsStrExt: Sealed { impl OsStrExt for OsStr { #[inline] fn encode_wide(&self) -> EncodeWide<'_> { - self.as_inner().inner.encode_wide() + EncodeWide { inner: self.as_inner().inner.encode_wide() } + } +} + +/// Iterator returned by [`OsStrExt::encode_wide`]. 
+#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Clone)] +pub struct EncodeWide<'a> { + inner: alloc::wtf8::EncodeWide<'a>, +} +#[stable(feature = "encode_wide_debug", since = "CURRENT_RUSTC_VERSION")] +impl fmt::Debug for EncodeWide<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.inner, f) + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for EncodeWide<'_> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() } } +#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")] +impl FusedIterator for EncodeWide<'_> {} diff --git a/library/std/src/sys/os_str/wtf8.rs b/library/std/src/sys/os_str/wtf8.rs index bbc704ebf86..96da891874e 100644 --- a/library/std/src/sys/os_str/wtf8.rs +++ b/library/std/src/sys/os_str/wtf8.rs @@ -1,12 +1,12 @@ //! The underlying OsString/OsStr implementation on Windows is a //! wrapper around the "WTF-8" encoding; see the `wtf8` module for more. +use alloc::wtf8::{Wtf8, Wtf8Buf}; use core::clone::CloneToUninit; use crate::borrow::Cow; use crate::collections::TryReserveError; use crate::rc::Rc; use crate::sync::Arc; -use crate::sys_common::wtf8::{Wtf8, Wtf8Buf, check_utf8_boundary}; use crate::sys_common::{AsInner, FromInner, IntoInner}; use crate::{fmt, mem}; @@ -220,7 +220,9 @@ impl Buf { /// trailing surrogate half. 
#[inline] pub unsafe fn extend_from_slice_unchecked(&mut self, other: &[u8]) { - self.inner.extend_from_slice(other); + unsafe { + self.inner.extend_from_slice_unchecked(other); + } } } @@ -238,7 +240,7 @@ impl Slice { #[track_caller] #[inline] pub fn check_public_boundary(&self, index: usize) { - check_utf8_boundary(&self.inner, index); + self.inner.check_utf8_boundary(index); } #[inline] diff --git a/library/std/src/sys_common/mod.rs b/library/std/src/sys_common/mod.rs index 24b6cff1309..ec45c723e0d 100644 --- a/library/std/src/sys_common/mod.rs +++ b/library/std/src/sys_common/mod.rs @@ -21,7 +21,6 @@ mod tests; pub mod wstr; -pub mod wtf8; // common error constructors diff --git a/src/tools/tidy/src/unit_tests.rs b/src/tools/tidy/src/unit_tests.rs index 3d14a467319..7396310ed37 100644 --- a/src/tools/tidy/src/unit_tests.rs +++ b/src/tools/tidy/src/unit_tests.rs @@ -61,6 +61,7 @@ pub fn check(root_path: &Path, stdlib: bool, bad: &mut bool) { || path.ends_with("library/alloc/src/collections/linked_list/tests.rs") || path.ends_with("library/alloc/src/collections/vec_deque/tests.rs") || path.ends_with("library/alloc/src/raw_vec/tests.rs") + || path.ends_with("library/alloc/src/wtf8/tests.rs") } }; -- cgit 1.4.1-3-g733a5 From 07aff76dda8fe022052de2ad7624df7de3306529 Mon Sep 17 00:00:00 2001 From: The Miri Cronjob Bot Date: Thu, 21 Aug 2025 04:53:54 +0000 Subject: Prepare for merging from rust-lang/rust This updates the rust-version file to 125ff8a788c5d6a66917f499abdc00051afe6886. 
--- src/tools/miri/rust-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version index 59adc572eaa..85ce9ed79f4 100644 --- a/src/tools/miri/rust-version +++ b/src/tools/miri/rust-version @@ -1 +1 @@ -f605b57042ffeb320d7ae44490113a827139b766 +125ff8a788c5d6a66917f499abdc00051afe6886 -- cgit 1.4.1-3-g733a5 From 70e7c058a0c8848a4ae3f4736b2a979e821a8e78 Mon Sep 17 00:00:00 2001 From: lumiscosity Date: Tue, 19 Aug 2025 11:15:20 +0200 Subject: Losslessly optimize PNG files Losslessly optimizes all of the PNG files in the repo. Done with: ``` oxipng -o max -a -s oxipng -o max --zopfli -a -s ``` --- src/doc/rustc/src/images/image1.png | Bin 164896 -> 112780 bytes src/doc/rustc/src/images/image2.png | Bin 155307 -> 107858 bytes src/doc/rustc/src/images/image3.png | Bin 19936 -> 15559 bytes src/doc/rustc/src/images/llvm-cov-show-01.png | Bin 416748 -> 206904 bytes src/doc/rustdoc/src/images/collapsed-long-item.png | Bin 17017 -> 11156 bytes .../rustdoc/src/images/collapsed-trait-impls.png | Bin 44225 -> 31081 bytes src/etc/installer/gfx/rust-logo.png | Bin 3909 -> 3261 bytes .../html/static/images/favicon-32x32.png | Bin 1125 -> 690 bytes 8 files changed, 0 insertions(+), 0 deletions(-) (limited to 'src') diff --git a/src/doc/rustc/src/images/image1.png b/src/doc/rustc/src/images/image1.png index 0da45e56620..3aad6359389 100644 Binary files a/src/doc/rustc/src/images/image1.png and b/src/doc/rustc/src/images/image1.png differ diff --git a/src/doc/rustc/src/images/image2.png b/src/doc/rustc/src/images/image2.png index a9cf23f8737..085b1c490b8 100644 Binary files a/src/doc/rustc/src/images/image2.png and b/src/doc/rustc/src/images/image2.png differ diff --git a/src/doc/rustc/src/images/image3.png b/src/doc/rustc/src/images/image3.png index 844a2fe6747..ee332f51055 100644 Binary files a/src/doc/rustc/src/images/image3.png and b/src/doc/rustc/src/images/image3.png differ diff --git 
a/src/doc/rustc/src/images/llvm-cov-show-01.png b/src/doc/rustc/src/images/llvm-cov-show-01.png index 35f04594347..ce4dec128b6 100644 Binary files a/src/doc/rustc/src/images/llvm-cov-show-01.png and b/src/doc/rustc/src/images/llvm-cov-show-01.png differ diff --git a/src/doc/rustdoc/src/images/collapsed-long-item.png b/src/doc/rustdoc/src/images/collapsed-long-item.png index c382870c64a..6de759fbeb9 100644 Binary files a/src/doc/rustdoc/src/images/collapsed-long-item.png and b/src/doc/rustdoc/src/images/collapsed-long-item.png differ diff --git a/src/doc/rustdoc/src/images/collapsed-trait-impls.png b/src/doc/rustdoc/src/images/collapsed-trait-impls.png index f685656e09a..96cc7db6798 100644 Binary files a/src/doc/rustdoc/src/images/collapsed-trait-impls.png and b/src/doc/rustdoc/src/images/collapsed-trait-impls.png differ diff --git a/src/etc/installer/gfx/rust-logo.png b/src/etc/installer/gfx/rust-logo.png index 99ee7507fa2..49d8d0d9485 100644 Binary files a/src/etc/installer/gfx/rust-logo.png and b/src/etc/installer/gfx/rust-logo.png differ diff --git a/src/librustdoc/html/static/images/favicon-32x32.png b/src/librustdoc/html/static/images/favicon-32x32.png index 69b8613ce15..0670c4dabb0 100644 Binary files a/src/librustdoc/html/static/images/favicon-32x32.png and b/src/librustdoc/html/static/images/favicon-32x32.png differ -- cgit 1.4.1-3-g733a5 From f702219ba61d94a50ecb581a4e0ab51dc459e99e Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Thu, 21 Aug 2025 16:12:43 +0200 Subject: update rustc-build-sysroot --- src/tools/miri/cargo-miri/Cargo.lock | 4 ++-- src/tools/miri/cargo-miri/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/tools/miri/cargo-miri/Cargo.lock b/src/tools/miri/cargo-miri/Cargo.lock index b3f5dafab64..ea9c04a3cb5 100644 --- a/src/tools/miri/cargo-miri/Cargo.lock +++ b/src/tools/miri/cargo-miri/Cargo.lock @@ -429,9 +429,9 @@ dependencies = [ [[package]] name = "rustc-build-sysroot" -version = 
"0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb13874a0e55baf4ac3d49d38206aecb31a55b75d6c4d04fd850b53942c8cc8" +checksum = "dd41ead66a69880951b2f7df3139db401d44451b4da123344d27eaa791b89c95" dependencies = [ "anyhow", "rustc_version", diff --git a/src/tools/miri/cargo-miri/Cargo.toml b/src/tools/miri/cargo-miri/Cargo.toml index 77cb1df8e74..64b56ea114e 100644 --- a/src/tools/miri/cargo-miri/Cargo.toml +++ b/src/tools/miri/cargo-miri/Cargo.toml @@ -18,7 +18,7 @@ directories = "6" rustc_version = "0.4" serde_json = "1.0.40" cargo_metadata = "0.21" -rustc-build-sysroot = "0.5.8" +rustc-build-sysroot = "0.5.10" # Enable some feature flags that dev-dependencies need but dependencies # do not. This makes `./miri install` after `./miri build` faster. -- cgit 1.4.1-3-g733a5 From 0a5383c35a56886a6638832862132eafb288f94b Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Wed, 20 Aug 2025 14:39:50 -0700 Subject: Promote aarch64-pc-windows-msvc to Tier 1 --- compiler/rustc_target/src/spec/targets/aarch64_pc_windows_msvc.rs | 2 +- src/bootstrap/src/core/build_steps/llvm.rs | 2 +- src/doc/rustc/src/platform-support.md | 2 +- src/doc/rustc/src/platform-support/windows-msvc.md | 5 +---- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/compiler/rustc_target/src/spec/targets/aarch64_pc_windows_msvc.rs b/compiler/rustc_target/src/spec/targets/aarch64_pc_windows_msvc.rs index f1b6fa123de..cd55576ef81 100644 --- a/compiler/rustc_target/src/spec/targets/aarch64_pc_windows_msvc.rs +++ b/compiler/rustc_target/src/spec/targets/aarch64_pc_windows_msvc.rs @@ -15,7 +15,7 @@ pub(crate) fn target() -> Target { llvm_target: "aarch64-pc-windows-msvc".into(), metadata: TargetMetadata { description: Some("ARM64 Windows MSVC".into()), - tier: Some(2), + tier: Some(1), host_tools: Some(true), std: Some(true), }, diff --git a/src/bootstrap/src/core/build_steps/llvm.rs 
b/src/bootstrap/src/core/build_steps/llvm.rs index 260108292e0..303b185919e 100644 --- a/src/bootstrap/src/core/build_steps/llvm.rs +++ b/src/bootstrap/src/core/build_steps/llvm.rs @@ -205,6 +205,7 @@ pub(crate) fn is_ci_llvm_available_for_target( // tier 1 ("aarch64-unknown-linux-gnu", false), ("aarch64-apple-darwin", false), + ("aarch64-pc-windows-msvc", false), ("i686-pc-windows-gnu", false), ("i686-pc-windows-msvc", false), ("i686-unknown-linux-gnu", false), @@ -213,7 +214,6 @@ pub(crate) fn is_ci_llvm_available_for_target( ("x86_64-pc-windows-gnu", true), ("x86_64-pc-windows-msvc", true), // tier 2 with host tools - ("aarch64-pc-windows-msvc", false), ("aarch64-unknown-linux-musl", false), ("arm-unknown-linux-gnueabi", false), ("arm-unknown-linux-gnueabihf", false), diff --git a/src/doc/rustc/src/platform-support.md b/src/doc/rustc/src/platform-support.md index c039517a970..13f03ac7c42 100644 --- a/src/doc/rustc/src/platform-support.md +++ b/src/doc/rustc/src/platform-support.md @@ -33,6 +33,7 @@ All tier 1 targets with host tools support the full standard library. target | notes -------|------- [`aarch64-apple-darwin`](platform-support/apple-darwin.md) | ARM64 macOS (11.0+, Big Sur+) +[`aarch64-pc-windows-msvc`](platform-support/windows-msvc.md) | ARM64 Windows MSVC `aarch64-unknown-linux-gnu` | ARM64 Linux (kernel 4.1+, glibc 2.17+) [`i686-pc-windows-msvc`](platform-support/windows-msvc.md) | 32-bit MSVC (Windows 10+, Windows Server 2016+, Pentium 4) [^x86_32-floats-return-ABI] [^win32-msvc-alignment] `i686-unknown-linux-gnu` | 32-bit Linux (kernel 3.2+, glibc 2.17+, Pentium 4) [^x86_32-floats-return-ABI] @@ -88,7 +89,6 @@ so Rustup may install the documentation for a similar tier 1 target instead. 
target | notes -------|------- [`aarch64-pc-windows-gnullvm`](platform-support/windows-gnullvm.md) | ARM64 MinGW (Windows 10+), LLVM ABI -[`aarch64-pc-windows-msvc`](platform-support/windows-msvc.md) | ARM64 Windows MSVC [`aarch64-unknown-linux-musl`](platform-support/aarch64-unknown-linux-musl.md) | ARM64 Linux with musl 1.2.3 [`aarch64-unknown-linux-ohos`](platform-support/openharmony.md) | ARM64 OpenHarmony `arm-unknown-linux-gnueabi` | Armv6 Linux (kernel 3.2+, glibc 2.17) diff --git a/src/doc/rustc/src/platform-support/windows-msvc.md b/src/doc/rustc/src/platform-support/windows-msvc.md index 71dc4ddc2e6..826c75b79c5 100644 --- a/src/doc/rustc/src/platform-support/windows-msvc.md +++ b/src/doc/rustc/src/platform-support/windows-msvc.md @@ -4,13 +4,10 @@ Windows MSVC targets. **Tier 1 with host tools:** +- `aarch64-pc-windows-msvc`: Windows on ARM64. - `i686-pc-windows-msvc`: Windows on 32-bit x86. - `x86_64-pc-windows-msvc`: Windows on 64-bit x86. -**Tier 2 with host tools:** - -- `aarch64-pc-windows-msvc`: Windows on ARM64. 
- ## Target maintainers [@ChrisDenton](https://github.com/ChrisDenton) -- cgit 1.4.1-3-g733a5 From 5c3f317187c86fe06cd40a81b7072323a2f8ec55 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Thu, 21 Aug 2025 21:20:28 +0200 Subject: CI: also test on powerpc --- src/tools/miri/.github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'src') diff --git a/src/tools/miri/.github/workflows/ci.yml b/src/tools/miri/.github/workflows/ci.yml index 7d79c384f85..c0fed96d4e6 100644 --- a/src/tools/miri/.github/workflows/ci.yml +++ b/src/tools/miri/.github/workflows/ci.yml @@ -41,6 +41,11 @@ jobs: multiarch: s390x gcc_cross: s390x-linux-gnu qemu: true + - host_target: powerpc64le-unknown-linux-gnu + os: ubuntu-latest + multiarch: ppc64el + gcc_cross: powerpc64le-linux-gnu + qemu: true - host_target: aarch64-apple-darwin os: macos-latest - host_target: i686-pc-windows-msvc -- cgit 1.4.1-3-g733a5 From 15a8999aedc0e3a0eb7b4e956dd79b3a2f28f2a3 Mon Sep 17 00:00:00 2001 From: binarycat Date: Sun, 17 Aug 2025 11:23:19 -0500 Subject: refactor rustdoc::invalid_html_tags tag parser previously, this lint did not distinguish between ``, and since the latter should be accepted under html5, the former was also accepted. the parser now also handles multi-line tags and multi-line attributes. 
--- src/librustdoc/lib.rs | 1 + src/librustdoc/passes/lint/html_tags.rs | 476 +++++++++++++++--------- src/librustdoc/passes/lint/html_tags/tests.rs | 73 ++++ tests/rustdoc-ui/lints/invalid-html-tags.rs | 67 +++- tests/rustdoc-ui/lints/invalid-html-tags.stderr | 69 +++- 5 files changed, 498 insertions(+), 188 deletions(-) create mode 100644 src/librustdoc/passes/lint/html_tags/tests.rs (limited to 'src') diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs index 28dbd8ba7d3..62e1ad2444d 100644 --- a/src/librustdoc/lib.rs +++ b/src/librustdoc/lib.rs @@ -11,6 +11,7 @@ #![feature(file_buffered)] #![feature(format_args_nl)] #![feature(if_let_guard)] +#![feature(iter_advance_by)] #![feature(iter_intersperse)] #![feature(round_char_boundary)] #![feature(rustc_private)] diff --git a/src/librustdoc/passes/lint/html_tags.rs b/src/librustdoc/passes/lint/html_tags.rs index 19cf15d40a3..b9c9279daec 100644 --- a/src/librustdoc/passes/lint/html_tags.rs +++ b/src/librustdoc/passes/lint/html_tags.rs @@ -1,9 +1,11 @@ //! Detects invalid HTML (like an unclosed ``) in doc comments. +use std::borrow::Cow; use std::iter::Peekable; use std::ops::Range; use std::str::CharIndices; +use itertools::Itertools as _; use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd}; use rustc_hir::HirId; use rustc_resolve::rustdoc::source_span_for_markdown_range; @@ -101,7 +103,7 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: & }); }; - let mut tags = Vec::new(); + let mut tagp = TagParser::new(); let mut is_in_comment = None; let mut in_code_block = false; @@ -126,70 +128,65 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: & }; let p = Parser::new_with_broken_link_callback(dox, main_body_opts(), Some(&mut replacer)) - .into_offset_iter(); + .into_offset_iter() + .coalesce(|a, b| { + // for some reason, pulldown-cmark splits html blocks into separate events for each line. 
+ // we undo this, in order to handle multi-line tags. + match (a, b) { + ((Event::Html(_), ra), (Event::Html(_), rb)) if ra.end == rb.start => { + let merged = ra.start..rb.end; + Ok((Event::Html(Cow::Borrowed(&dox[merged.clone()]).into()), merged)) + } + x => Err(x), + } + }); for (event, range) in p { match event { Event::Start(Tag::CodeBlock(_)) => in_code_block = true, Event::Html(text) | Event::InlineHtml(text) if !in_code_block => { - extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag) + tagp.extract_tags(&text, range, &mut is_in_comment, &report_diag) } Event::End(TagEnd::CodeBlock) => in_code_block = false, _ => {} } } - for (tag, range) in tags.iter().filter(|(t, _)| { - let t = t.to_lowercase(); - !ALLOWED_UNCLOSED.contains(&t.as_str()) - }) { - report_diag(format!("unclosed HTML tag `{tag}`"), range, true); - } - if let Some(range) = is_in_comment { report_diag("Unclosed HTML comment".to_string(), &range, false); + } else if let &Some(quote_pos) = &tagp.quote_pos { + let qr = Range { start: quote_pos, end: quote_pos }; + report_diag( + format!("unclosed quoted HTML attribute on tag `{}`", &tagp.tag_name), + &qr, + false, + ); + } else { + if !tagp.tag_name.is_empty() { + report_diag( + format!("incomplete HTML tag `{}`", &tagp.tag_name), + &(tagp.tag_start_pos..dox.len()), + false, + ); + } + for (tag, range) in tagp.tags.iter().filter(|(t, _)| { + let t = t.to_lowercase(); + !is_implicitly_self_closing(&t) + }) { + report_diag(format!("unclosed HTML tag `{tag}`"), range, true); + } } } +/// These tags are interpreted as self-closing if they lack an explicit closing tag. 
const ALLOWED_UNCLOSED: &[&str] = &[ "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr", ]; -fn drop_tag( - tags: &mut Vec<(String, Range)>, - tag_name: String, - range: Range, - f: &impl Fn(String, &Range, bool), -) { - let tag_name_low = tag_name.to_lowercase(); - if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) { - // If the tag is nested inside a "` (the `h2` tag isn't required - // but it helps for the visualization). - f(format!("unopened HTML tag `{tag_name}`"), &range, false); - } +/// Allows constructs like ``, but not ` bool { + ALLOWED_UNCLOSED.contains(&tag_name) } fn extract_path_backwards(text: &str, end_pos: usize) -> Option { @@ -252,151 +249,292 @@ fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool { c.is_ascii_alphabetic() || !is_empty && (c == '-' || c.is_ascii_digit()) } -fn extract_html_tag( - tags: &mut Vec<(String, Range)>, - text: &str, - range: &Range, - start_pos: usize, - iter: &mut Peekable>, - f: &impl Fn(String, &Range, bool), -) { - let mut tag_name = String::new(); - let mut is_closing = false; - let mut prev_pos = start_pos; +/// Parse html tags to ensure they are well-formed +#[derive(Debug, Clone)] +struct TagParser { + tags: Vec<(String, Range)>, + /// Name of the tag that is being parsed, if we are within a tag. + /// + /// Since the `<` and name of a tag must appear on the same line with no whitespace, + /// if this is the empty string, we are not in a tag. + tag_name: String, + tag_start_pos: usize, + is_closing: bool, + /// `true` if we are within a tag, but not within its name. + in_attrs: bool, + /// If we are in a quoted attribute, what quote char does it use? + /// + /// This needs to be stored in the struct since HTML5 allows newlines in quoted attrs. 
+ quote: Option, + quote_pos: Option, + after_eq: bool, +} - loop { - let (pos, c) = match iter.peek() { - Some((pos, c)) => (*pos, *c), - // In case we reached the of the doc comment, we want to check that it's an - // unclosed HTML tag. For example "/// (prev_pos, '\0'), - }; - prev_pos = pos; - // Checking if this is a closing tag (like `` for ``). - if c == '/' && tag_name.is_empty() { - is_closing = true; - } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) { - tag_name.push(c); - } else { - if !tag_name.is_empty() { - let mut r = Range { start: range.start + start_pos, end: range.start + pos }; - if c == '>' { - // In case we have a tag without attribute, we can consider the span to - // refer to it fully. - r.end += 1; +impl TagParser { + fn new() -> Self { + Self { + tags: Vec::new(), + tag_name: String::with_capacity(8), + tag_start_pos: 0, + is_closing: false, + in_attrs: false, + quote: None, + quote_pos: None, + after_eq: false, + } + } + + fn drop_tag(&mut self, range: Range, f: &impl Fn(String, &Range, bool)) { + let tag_name_low = self.tag_name.to_lowercase(); + if let Some(pos) = self.tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) { + // If the tag is nested inside a "` (the `h2` tag isn't required + // but it helps for the visualization). + f(format!("unopened HTML tag `{}`", &self.tag_name), &range, false); + } + } + + /// Handle a `<` that appeared while parsing a tag. + fn handle_lt_in_tag( + &mut self, + range: Range, + lt_pos: usize, + f: &impl Fn(String, &Range, bool), + ) { + let global_pos = range.start + lt_pos; + // is this check needed? + if global_pos == self.tag_start_pos { + // `<` is in the tag because it is the start. 
+ return; + } + // tried to start a new tag while in a tag + f( + format!("incomplete HTML tag `{}`", &self.tag_name), + &(self.tag_start_pos..global_pos), + false, + ); + self.tag_parsed(); + } + + fn extract_html_tag( + &mut self, + text: &str, + range: &Range, + start_pos: usize, + iter: &mut Peekable>, + f: &impl Fn(String, &Range, bool), + ) { + let mut prev_pos = start_pos; + + 'outer_loop: loop { + let (pos, c) = match iter.peek() { + Some((pos, c)) => (*pos, *c), + // In case we reached the of the doc comment, we want to check that it's an + // unclosed HTML tag. For example "/// (prev_pos, '\0'), + None => break, + }; + prev_pos = pos; + if c == '/' && self.tag_name.is_empty() { + // Checking if this is a closing tag (like `` for ``). + self.is_closing = true; + } else if !self.in_attrs && is_valid_for_html_tag_name(c, self.tag_name.is_empty()) { + self.tag_name.push(c); + } else { + if !self.tag_name.is_empty() { + self.in_attrs = true; + let mut r = Range { start: range.start + start_pos, end: range.start + pos }; + if c == '>' { + // In case we have a tag without attribute, we can consider the span to + // refer to it fully. + r.end += 1; + } + if self.is_closing { + // In case we have "" or even "". + if c != '>' { if !c.is_whitespace() { - if c == '>' { - r.end = range.start + new_pos + 1; - found = true; - } + // It seems like it's not a valid HTML tag. 
break; } - } - if !found { - break; - } - } - drop_tag(tags, tag_name, r, f); - } else { - let mut is_self_closing = false; - let mut quote_pos = None; - if c != '>' { - let mut quote = None; - let mut after_eq = false; - for (i, c) in text[pos..].char_indices() { - if !c.is_whitespace() { - if let Some(q) = quote { - if c == q { - quote = None; - quote_pos = None; - after_eq = false; + let mut found = false; + for (new_pos, c) in text[pos..].char_indices() { + if !c.is_whitespace() { + if c == '>' { + r.end = range.start + new_pos + 1; + found = true; + } else if c == '<' { + self.handle_lt_in_tag(range.clone(), pos + new_pos, f); } - } else if c == '>' { break; - } else if c == '/' && !after_eq { - is_self_closing = true; - } else { - if is_self_closing { - is_self_closing = false; - } - if (c == '"' || c == '\'') && after_eq { - quote = Some(c); - quote_pos = Some(pos + i); - } else if c == '=' { - after_eq = true; - } } - } else if quote.is_none() { - after_eq = false; + } + if !found { + break 'outer_loop; } } - } - if let Some(quote_pos) = quote_pos { - let qr = Range { start: quote_pos, end: quote_pos }; - f( - format!("unclosed quoted HTML attribute on tag `{tag_name}`"), - &qr, - false, - ); - } - if is_self_closing { - // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus - let valid = ALLOWED_UNCLOSED.contains(&&tag_name[..]) - || tags.iter().take(pos + 1).any(|(at, _)| { - let at = at.to_lowercase(); - at == "svg" || at == "math" - }); - if !valid { - f(format!("invalid self-closing HTML tag `{tag_name}`"), &r, false); - } + self.drop_tag(r, f); + self.tag_parsed(); } else { - tags.push((tag_name, r)); + self.extract_opening_tag(text, range, r, pos, c, iter, f) } } + break; } - break; + iter.next(); } - iter.next(); } -} - -fn extract_tags( - tags: &mut Vec<(String, Range)>, - text: &str, - range: Range, - is_in_comment: &mut Option>, - f: &impl Fn(String, &Range, bool), -) { - let mut iter = 
text.char_indices().peekable(); - while let Some((start_pos, c)) = iter.next() { - if is_in_comment.is_some() { - if text[start_pos..].starts_with("-->") { - *is_in_comment = None; + fn extract_opening_tag( + &mut self, + text: &str, + range: &Range, + r: Range, + pos: usize, + c: char, + iter: &mut Peekable>, + f: &impl Fn(String, &Range, bool), + ) { + // we can store this as a local, since html5 does require the `/` and `>` + // to not be separated by whitespace. + let mut is_self_closing = false; + if c != '>' { + 'parse_til_gt: { + for (i, c) in text[pos..].char_indices() { + if !c.is_whitespace() { + debug_assert_eq!(self.quote_pos.is_some(), self.quote.is_some()); + if let Some(q) = self.quote { + if c == q { + self.quote = None; + self.quote_pos = None; + self.after_eq = false; + } + } else if c == '>' { + break 'parse_til_gt; + } else if c == '<' { + self.handle_lt_in_tag(range.clone(), pos + i, f); + } else if c == '/' && !self.after_eq { + is_self_closing = true; + } else { + if is_self_closing { + is_self_closing = false; + } + if (c == '"' || c == '\'') && self.after_eq { + self.quote = Some(c); + self.quote_pos = Some(pos + i); + } else if c == '=' { + self.after_eq = true; + } + } + } else if self.quote.is_none() { + self.after_eq = false; + } + if !is_self_closing && !self.tag_name.is_empty() { + iter.next(); + } + } + // if we've run out of text but still haven't found a `>`, + // return early without calling `tag_parsed` or emitting lints. + // this allows us to either find the `>` in a later event + // or emit a lint about it being missing. + return; } - } else if c == '<' { - if text[start_pos..].starts_with("") { + *is_in_comment = None; + } + } else if c == '<' { + // " @@ -105,7 +106,7 @@ pub fn j() {} /// uiapp.run(&env::args().collect::>()); /// ``` /// -/// shouldn't warn! +// shouldn't warn! /// `````` pub fn k() {} @@ -141,14 +142,72 @@ pub fn no_error_2() {} /// pub fn no_error_3() {} +/// >
class="foo"> +/// >
+pub fn no_error_4() {} + /// unfinished ALLOWED_UNCLOSED /// +/// note: CommonMark doesn't allow an html block to start with a multiline tag, +/// so we use `
` a bunch to force these to be parsed as html blocks. +/// ///
/// -//~^ ERROR unclosed HTML tag `img` +//~^ ERROR incomplete HTML tag `img` pub fn r() {} + +/// >
+/// > href="#broken" +pub fn s() {} + +///
+/// +//~^ ERROR incomplete HTML tag `br` +pub fn t() {} + +///
+///
html5 allows this
+pub fn no_error_5() {} + +///
+/// +pub fn no_error_6() {} + +///
+/// what +pub fn no_error_7() {} + +/// Technically this is allowed per the html5 spec, +/// but there's basically no legitimate reason to do it, +/// so we don't allow it. +/// +///

foobar

+//~^ ERROR Unclosed HTML comment +//~| ERROR incomplete HTML tag `p` +pub fn v() {} diff --git a/tests/rustdoc-ui/lints/invalid-html-tags.stderr b/tests/rustdoc-ui/lints/invalid-html-tags.stderr index fc9849ff23c..b6ec22c2479 100644 --- a/tests/rustdoc-ui/lints/invalid-html-tags.stderr +++ b/tests/rustdoc-ui/lints/invalid-html-tags.stderr @@ -52,6 +52,12 @@ error: unclosed HTML tag `p` LL | ///

| ^^^ +error: incomplete HTML tag `script` + --> $DIR/invalid-html-tags.rs:45:5 + | +LL | ///