215 lines
7.9 KiB
Rust
215 lines
7.9 KiB
Rust
use std::sync::Arc;
|
|
|
|
use pyo3::prelude::*;
|
|
use pyo3::types::PyAny;
|
|
use pyo3::types::PyBytes;
|
|
use qroissant_arrow::ListProjection;
|
|
use qroissant_arrow::ProjectionOptions;
|
|
use qroissant_arrow::StringProjection;
|
|
use qroissant_arrow::SymbolProjection;
|
|
use qroissant_core::DecodeOptions as CoreDecodeOptions;
|
|
use qroissant_core::Value as CoreValue;
|
|
use qroissant_core::decode_message_with_options;
|
|
use qroissant_core::encode_message;
|
|
use qroissant_transport::extract_q_error;
|
|
|
|
use crate::errors::PythonError;
|
|
use crate::errors::PythonResult;
|
|
use crate::errors::to_py_err;
|
|
use crate::types::Compression;
|
|
use crate::types::DecodeOptions;
|
|
use crate::types::EncodeOptions;
|
|
use crate::types::Encoding;
|
|
use crate::types::ListInterpretation;
|
|
use crate::types::MessageType;
|
|
use crate::types::StringInterpretation;
|
|
use crate::types::SymbolInterpretation;
|
|
use crate::values::core_value_to_python_with_opts;
|
|
use crate::values::python_to_core_value;
|
|
|
|
/// Maps Python-facing "Interpretation" options to Rust-internal "Projection" options.
|
|
///
|
|
/// The Python API uses "Interpretation" (e.g. `SymbolInterpretation`) as it describes
|
|
/// how the user wants data to be interpreted. The Rust/Arrow layer uses "Projection"
|
|
/// (e.g. `SymbolProjection`) as it describes how values are projected into Arrow arrays.
|
|
/// Both refer to the same concept viewed from different perspectives.
|
|
pub fn decode_options_to_proj_opts(opts: Option<&DecodeOptions>) -> Arc<ProjectionOptions> {
|
|
let opts = opts.map(|o| o.clone()).unwrap_or_default();
|
|
Arc::new(ProjectionOptions {
|
|
symbol: match opts.symbol_interpretation_value() {
|
|
SymbolInterpretation::Utf8 => SymbolProjection::Utf8,
|
|
SymbolInterpretation::LargeUtf8 => SymbolProjection::LargeUtf8,
|
|
SymbolInterpretation::Utf8View => SymbolProjection::Utf8View,
|
|
SymbolInterpretation::Dictionary => SymbolProjection::Dictionary,
|
|
SymbolInterpretation::RawBytes => SymbolProjection::RawBytes,
|
|
},
|
|
string: match opts.string_interpretation_value() {
|
|
StringInterpretation::Utf8 => StringProjection::Utf8,
|
|
StringInterpretation::Binary => StringProjection::Binary,
|
|
},
|
|
list: match opts.list_interpretation_value() {
|
|
ListInterpretation::List => ListProjection::List,
|
|
ListInterpretation::LargeList => ListProjection::LargeList,
|
|
ListInterpretation::ListView => ListProjection::ListView,
|
|
},
|
|
union_mode: match opts.union_mode_value() {
|
|
crate::types::UnionMode::Dense => qroissant_arrow::UnionMode::Dense,
|
|
crate::types::UnionMode::Sparse => qroissant_arrow::UnionMode::Sparse,
|
|
},
|
|
treat_infinity_as_null: opts.treat_infinity_as_null(),
|
|
parallel: opts.parallel_value(),
|
|
assume_symbol_utf8: opts.assume_symbol_utf8_value(),
|
|
})
|
|
}
|
|
|
|
fn decode_options_to_core(opts: &DecodeOptions) -> CoreDecodeOptions {
|
|
CoreDecodeOptions {
|
|
parallel: opts.parallel_value(),
|
|
..CoreDecodeOptions::default()
|
|
}
|
|
}
|
|
|
|
fn ensure_default_encode_options(options: Option<&EncodeOptions>) -> PythonResult<()> {
|
|
if let Some(options) = options
|
|
&& options != &EncodeOptions::default()
|
|
{
|
|
return Err(PythonError::NotImplemented(
|
|
"custom encode options are not implemented yet".to_string(),
|
|
));
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub fn decode_core_value(
|
|
payload: bytes::Bytes,
|
|
options: Option<&DecodeOptions>,
|
|
) -> PythonResult<(CoreValue, Arc<ProjectionOptions>)> {
|
|
if let Some(message) =
|
|
extract_q_error(payload.as_ref()).map_err(crate::errors::map_transport_error)?
|
|
{
|
|
return Err(PythonError::QRuntime(message));
|
|
}
|
|
let core_opts = options.map(decode_options_to_core).unwrap_or_default();
|
|
let decoded = decode_message_with_options(payload, &core_opts)
|
|
.map_err(|error| PythonError::Decode(error.to_string()))?;
|
|
let proj_opts = decode_options_to_proj_opts(options);
|
|
let (_header, value) = decoded.into_parts();
|
|
Ok((value, proj_opts))
|
|
}
|
|
|
|
/// Wraps a Python `bytes` object in a [`bytes::Bytes`] without copying.
|
|
///
|
|
/// CPython `bytes` objects are immutable and their backing memory is never
|
|
/// moved, so it is sound to hold a raw pointer into them as long as the
|
|
/// `Py<PyBytes>` reference (which increments the CPython refcount) is alive.
|
|
struct PinnedPyBytes {
|
|
_owner: Py<PyBytes>,
|
|
ptr: *const u8,
|
|
len: usize,
|
|
}
|
|
|
|
// SAFETY: `Py<PyBytes>` is `Send`, and the pointed-to memory is immutable.
|
|
unsafe impl Send for PinnedPyBytes {}
|
|
// SAFETY: The data is immutable and the owner keeps it alive.
|
|
unsafe impl Sync for PinnedPyBytes {}
|
|
|
|
impl AsRef<[u8]> for PinnedPyBytes {
|
|
#[inline]
|
|
fn as_ref(&self) -> &[u8] {
|
|
// SAFETY: `ptr` is valid for `len` bytes while `_owner` keeps the
|
|
// CPython bytes object alive (refcount > 0, no deallocation possible).
|
|
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
|
|
}
|
|
}
|
|
|
|
/// Minimum payload size for the zero-copy `PinnedPyBytes` path.
|
|
///
|
|
/// For small payloads the `Arc` allocation inside `Bytes::from_owner` costs
|
|
/// more than a plain `memcpy`, so we fall back to copying below this threshold.
|
|
const ZERO_COPY_MIN_BYTES: usize = 32 * 1024; // 32 KB
|
|
|
|
/// Converts a Python `bytes`-like object into a [`bytes::Bytes`].
|
|
///
|
|
/// For plain `bytes` objects ≥ [`ZERO_COPY_MIN_BYTES`] the underlying buffer
|
|
/// is **borrowed without copying** via [`bytes::Bytes::from_owner`].
|
|
/// Smaller payloads and other buffer protocols (bytearray, memoryview) take a
|
|
/// single copy — same cost as before.
|
|
fn payload_to_bytes(payload: &Bound<'_, PyAny>) -> PyResult<bytes::Bytes> {
|
|
if let Ok(pb) = payload.downcast::<PyBytes>() {
|
|
let data = pb.as_bytes();
|
|
if data.len() >= ZERO_COPY_MIN_BYTES {
|
|
let pinned = PinnedPyBytes {
|
|
_owner: pb.clone().unbind(),
|
|
ptr: data.as_ptr(),
|
|
len: data.len(),
|
|
};
|
|
return Ok(bytes::Bytes::from_owner(pinned));
|
|
}
|
|
return Ok(bytes::Bytes::copy_from_slice(data));
|
|
}
|
|
Ok(bytes::Bytes::from(payload.extract::<Vec<u8>>()?))
|
|
}
|
|
|
|
pub fn encode_core_value_bytes(
|
|
value: &CoreValue,
|
|
options: Option<&EncodeOptions>,
|
|
encoding: Encoding,
|
|
message_type: MessageType,
|
|
compression: Compression,
|
|
) -> PythonResult<Vec<u8>> {
|
|
ensure_default_encode_options(options)?;
|
|
encode_message(
|
|
value,
|
|
encoding.into(),
|
|
message_type.into(),
|
|
compression.into(),
|
|
)
|
|
.map_err(|error| PythonError::Protocol(error.to_string()))
|
|
}
|
|
|
|
#[pyfunction]
|
|
#[pyo3(signature = (payload, /, *, options=None))]
|
|
pub fn decode(
|
|
py: Python<'_>,
|
|
payload: &Bound<'_, PyAny>,
|
|
options: Option<&DecodeOptions>,
|
|
) -> PyResult<Py<PyAny>> {
|
|
let bytes = payload_to_bytes(payload)?;
|
|
let options_clone = options.cloned();
|
|
let (value, proj_opts) = py
|
|
.detach(|| decode_core_value(bytes, options_clone.as_ref()))
|
|
.map_err(to_py_err)?;
|
|
core_value_to_python_with_opts(py, value, proj_opts)
|
|
}
|
|
|
|
#[pyfunction]
|
|
#[pyo3(signature = (value, /, *, options=None, encoding=Encoding::LittleEndian, message_type=MessageType::Asynchronous, compression=Compression::Uncompressed))]
|
|
pub fn encode(
|
|
py: Python<'_>,
|
|
value: &Bound<'_, PyAny>,
|
|
options: Option<&EncodeOptions>,
|
|
encoding: Encoding,
|
|
message_type: MessageType,
|
|
compression: Compression,
|
|
) -> PyResult<Py<PyBytes>> {
|
|
let value = python_to_core_value(value)?;
|
|
let options_clone = options.cloned();
|
|
let payload = py
|
|
.detach(|| {
|
|
encode_core_value_bytes(
|
|
&value,
|
|
options_clone.as_ref(),
|
|
encoding,
|
|
message_type,
|
|
compression,
|
|
)
|
|
})
|
|
.map_err(to_py_err)?;
|
|
Ok(PyBytes::new(py, &payload).unbind())
|
|
}
|
|
|
|
pub fn register(module: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
module.add_function(wrap_pyfunction!(decode, module)?)?;
|
|
module.add_function(wrap_pyfunction!(encode, module)?)?;
|
|
Ok(())
|
|
}
|