[nca] Use better tight loop allocation schemes (none at all) for AES decrypt/encrypt and force MbedTLS to use AES x86_64 instructions (#2750)

Uses the stack instead of heap-allocating haphazardly (16 bytes and 512 bytes respectively) - removes malloc() pollution and its overhead from tight loops
Original work by Ribbit but edited by me.
Will NOT bring a massive speedup since the main bottleneck is mbedtls itself, but may bring nice improvements to startup times nonetheless.
AES instructions being forced won't affect CPUs without them, since there is always a runtime check for them.

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Co-authored-by: Ribbit <ribbit@placeholder.com>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2750
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
This commit is contained in:
lizzie 2025-10-17 05:08:51 +02:00 committed by crueter
parent 551f244dfd
commit 440ee4916d
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
9 changed files with 200 additions and 130 deletions

View file

@@ -1,7 +1,11 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <array>
#include <vector>
#include <mbedtls/cipher.h>
#include "common/assert.h"
#include "common/logging/log.h"
@@ -71,37 +75,37 @@ void AESCipher<Key, KeySize>::Transcode(const u8* src, std::size_t size, u8* des
mbedtls_cipher_reset(context);
// Only ECB strictly requires block sized chunks.
std::size_t written = 0;
if (mbedtls_cipher_get_cipher_mode(context) == MBEDTLS_MODE_XTS) {
if (mbedtls_cipher_get_cipher_mode(context) != MBEDTLS_MODE_ECB) {
mbedtls_cipher_update(context, src, size, dest, &written);
if (written != size) {
LOG_WARNING(Crypto, "Not all data was decrypted requested={:016X}, actual={:016X}.",
size, written);
}
} else {
const auto block_size = mbedtls_cipher_get_block_size(context);
if (size < block_size) {
std::vector<u8> block(block_size);
std::memcpy(block.data(), src, size);
Transcode(block.data(), block.size(), block.data(), op);
std::memcpy(dest, block.data(), size);
return;
}
if (written != size)
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", size, written);
return;
}
for (std::size_t offset = 0; offset < size; offset += block_size) {
auto length = std::min<std::size_t>(block_size, size - offset);
mbedtls_cipher_update(context, src + offset, length, dest + offset, &written);
if (written != length) {
if (length < block_size) {
std::vector<u8> block(block_size);
std::memcpy(block.data(), src + offset, length);
Transcode(block.data(), block.size(), block.data(), op);
std::memcpy(dest + offset, block.data(), length);
return;
}
LOG_WARNING(Crypto, "Not all data was decrypted requested={:016X}, actual={:016X}.",
length, written);
// ECB path: operate in block sized chunks and mirror previous behavior.
const auto block_size = mbedtls_cipher_get_block_size(context);
if (size < block_size) {
std::vector<u8> block(block_size);
std::memcpy(block.data(), src, size);
Transcode(block.data(), block.size(), block.data(), op);
std::memcpy(dest, block.data(), size);
return;
}
for (std::size_t offset = 0; offset < size; offset += block_size) {
const auto length = std::min<std::size_t>(block_size, size - offset);
mbedtls_cipher_update(context, src + offset, length, dest + offset, &written);
if (written != length) {
if (length < block_size) {
std::vector<u8> block(block_size);
std::memcpy(block.data(), src + offset, length);
Transcode(block.data(), block.size(), block.data(), op);
std::memcpy(dest + offset, block.data(), length);
return;
}
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", length, written);
}
}
}

View file

@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@@ -15,26 +18,36 @@ std::size_t CTREncryptionLayer::Read(u8* data, std::size_t length, std::size_t o
if (length == 0)
return 0;
const auto sector_offset = offset & 0xF;
if (sector_offset == 0) {
std::size_t total_read = 0;
// Handle an initial misaligned portion if needed.
if (auto const sector_offset = offset & 0xF; sector_offset != 0) {
const std::size_t aligned_off = offset - sector_offset;
std::array<u8, 0x10> block{};
if (auto const got = base->Read(block.data(), block.size(), aligned_off); got != 0) {
UpdateIV(base_offset + aligned_off);
cipher.Transcode(block.data(), got, block.data(), Op::Decrypt);
auto const to_copy = std::min<std::size_t>(length, got > sector_offset ? got - sector_offset : 0);
if (to_copy > 0) {
std::memcpy(data, block.data() + sector_offset, to_copy);
data += to_copy;
offset += to_copy;
length -= to_copy;
total_read += to_copy;
}
} else {
return 0;
}
}
if (length > 0) {
// Now aligned to 0x10
UpdateIV(base_offset + offset);
std::vector<u8> raw = base->ReadBytes(length, offset);
cipher.Transcode(raw.data(), raw.size(), data, Op::Decrypt);
return length;
const std::size_t got = base->Read(data, length, offset);
if (got > 0) {
cipher.Transcode(data, got, data, Op::Decrypt);
total_read += got;
}
}
// offset does not fall on block boundary (0x10)
std::vector<u8> block = base->ReadBytes(0x10, offset - sector_offset);
UpdateIV(base_offset + offset - sector_offset);
cipher.Transcode(block.data(), block.size(), block.data(), Op::Decrypt);
std::size_t read = 0x10 - sector_offset;
if (length + sector_offset < 0x10) {
std::memcpy(data, block.data() + sector_offset, std::min<u64>(length, read));
return std::min<u64>(length, read);
}
std::memcpy(data, block.data() + sector_offset, read);
return read + Read(data + read, length - read, offset + read);
return total_read;
}
void CTREncryptionLayer::SetIV(const IVData& iv_) {

View file

@@ -5,12 +5,13 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <array>
#include <cstring>
#include "core/crypto/xts_encryption_layer.h"
namespace Core::Crypto {
constexpr u64 XTS_SECTOR_SIZE = 0x4000;
constexpr std::size_t XTS_SECTOR_SIZE = 0x4000;
XTSEncryptionLayer::XTSEncryptionLayer(FileSys::VirtualFile base_, Key256 key_)
: EncryptionLayer(std::move(base_)), cipher(key_, Mode::XTS) {}
@@ -19,41 +20,67 @@ std::size_t XTSEncryptionLayer::Read(u8* data, std::size_t length, std::size_t o
if (length == 0)
return 0;
const auto sector_offset = offset & 0x3FFF;
if (sector_offset == 0) {
if (length % XTS_SECTOR_SIZE == 0) {
std::vector<u8> raw = base->ReadBytes(length, offset);
cipher.XTSTranscode(raw.data(), raw.size(), data, offset / XTS_SECTOR_SIZE,
std::size_t total_read = 0;
// Handle initial unaligned part within a sector.
if (auto const sector_offset = offset % XTS_SECTOR_SIZE; sector_offset != 0) {
const std::size_t aligned_off = offset - sector_offset;
std::array<u8, XTS_SECTOR_SIZE> block{};
if (auto const got = base->Read(block.data(), XTS_SECTOR_SIZE, aligned_off); got > 0) {
if (got < XTS_SECTOR_SIZE)
std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got);
cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(), aligned_off / XTS_SECTOR_SIZE,
XTS_SECTOR_SIZE, Op::Decrypt);
return raw.size();
auto const to_copy = std::min<std::size_t>(length, got > sector_offset ? got - sector_offset : 0);
if (to_copy > 0) {
std::memcpy(data, block.data() + sector_offset, to_copy);
data += to_copy;
offset += to_copy;
length -= to_copy;
total_read += to_copy;
}
} else {
return 0;
}
if (length > XTS_SECTOR_SIZE) {
const auto rem = length % XTS_SECTOR_SIZE;
const auto read = length - rem;
return Read(data, read, offset) + Read(data + read, rem, offset + read);
}
std::vector<u8> buffer = base->ReadBytes(XTS_SECTOR_SIZE, offset);
if (buffer.size() < XTS_SECTOR_SIZE)
buffer.resize(XTS_SECTOR_SIZE);
cipher.XTSTranscode(buffer.data(), buffer.size(), buffer.data(), offset / XTS_SECTOR_SIZE,
XTS_SECTOR_SIZE, Op::Decrypt);
std::memcpy(data, buffer.data(), (std::min)(buffer.size(), length));
return (std::min)(buffer.size(), length);
}
// offset does not fall on block boundary (0x4000)
std::vector<u8> block = base->ReadBytes(0x4000, offset - sector_offset);
if (block.size() < XTS_SECTOR_SIZE)
block.resize(XTS_SECTOR_SIZE);
cipher.XTSTranscode(block.data(), block.size(), block.data(),
(offset - sector_offset) / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt);
const std::size_t read = XTS_SECTOR_SIZE - sector_offset;
if (length + sector_offset < XTS_SECTOR_SIZE) {
std::memcpy(data, block.data() + sector_offset, std::min<u64>(length, read));
return std::min<u64>(length, read);
if (length > 0) {
// Process aligned middle inplace, in sector sized multiples.
while (length >= XTS_SECTOR_SIZE) {
const std::size_t req = (length / XTS_SECTOR_SIZE) * XTS_SECTOR_SIZE;
const std::size_t got = base->Read(data, req, offset);
if (got == 0) {
return total_read;
}
const std::size_t got_rounded = got - (got % XTS_SECTOR_SIZE);
if (got_rounded > 0) {
cipher.XTSTranscode(data, got_rounded, data, offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt);
data += got_rounded;
offset += got_rounded;
length -= got_rounded;
total_read += got_rounded;
}
// If we didn't get a full sector next, break to handle tail.
if (got_rounded != got) {
break;
}
}
// Handle tail within a sector, if any.
if (length > 0) {
std::array<u8, XTS_SECTOR_SIZE> block{};
const std::size_t got = base->Read(block.data(), XTS_SECTOR_SIZE, offset);
if (got > 0) {
if (got < XTS_SECTOR_SIZE) {
std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got);
}
cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(),
offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt);
const std::size_t to_copy = std::min<std::size_t>(length, got);
std::memcpy(data, block.data(), to_copy);
total_read += to_copy;
}
}
}
std::memcpy(data, block.data() + sector_offset, read);
return read + Read(data + read, length - read, offset + read);
return total_read;
}
} // namespace Core::Crypto