qroissant/crates/qroissant-arrow/src/ingestion.rs

1576 lines
50 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Arrow ingestion: converts Arrow arrays and record batches into q `Value` trees.
//!
//! This is the reverse direction of [`crate::projection`]. Arrow field
//! metadata produced by the projection layer (`qroissant.shape`,
//! `qroissant.primitive`, etc.) is consumed here so that round-trips through
//! Arrow preserve exact q semantics.
//!
//! No PyO3 or Python dependencies are allowed in this crate; PyCapsule
//! handling lives in `qroissant-python`.
use arrow_array::Array;
use arrow_array::ArrayRef;
use arrow_array::BinaryArray;
use arrow_array::BinaryViewArray;
use arrow_array::BooleanArray;
use arrow_array::Date32Array;
use arrow_array::DurationMicrosecondArray;
use arrow_array::DurationMillisecondArray;
use arrow_array::DurationNanosecondArray;
use arrow_array::DurationSecondArray;
use arrow_array::FixedSizeBinaryArray;
use arrow_array::Float32Array;
use arrow_array::Float64Array;
use arrow_array::Int16Array;
use arrow_array::Int32Array;
use arrow_array::Int64Array;
use arrow_array::LargeBinaryArray;
use arrow_array::LargeListArray;
use arrow_array::LargeStringArray;
use arrow_array::ListArray;
use arrow_array::MapArray;
use arrow_array::RecordBatch;
use arrow_array::StringArray;
use arrow_array::StringViewArray;
use arrow_array::StructArray;
use arrow_array::Time32MillisecondArray;
use arrow_array::Time32SecondArray;
use arrow_array::Time64MicrosecondArray;
use arrow_array::Time64NanosecondArray;
use arrow_array::TimestampMicrosecondArray;
use arrow_array::TimestampMillisecondArray;
use arrow_array::TimestampNanosecondArray;
use arrow_array::TimestampSecondArray;
use arrow_array::UInt8Array;
use arrow_schema::DataType;
use arrow_schema::Field as ArrowField;
use arrow_schema::SchemaRef;
use arrow_schema::TimeUnit;
use qroissant_core::Atom;
use qroissant_core::Attribute;
use qroissant_core::Dictionary;
use qroissant_core::List;
use qroissant_core::Table;
use qroissant_core::Value;
use qroissant_core::Vector;
use qroissant_core::VectorData;
use qroissant_kernels::nulls::Q_NULL_DATE;
use qroissant_kernels::nulls::Q_NULL_MINUTE;
use qroissant_kernels::nulls::Q_NULL_SECOND;
use qroissant_kernels::nulls::Q_NULL_SHORT;
use qroissant_kernels::nulls::Q_NULL_TIME;
use qroissant_kernels::nulls::Q_NULL_TIMESPAN;
use qroissant_kernels::nulls::Q_NULL_TIMESTAMP;
use qroissant_kernels::temporal::DATE_OFFSET_DAYS;
use qroissant_kernels::temporal::TIMESTAMP_OFFSET_NS;
use crate::error::IngestionError;
use crate::error::IngestionResult;
/// Converts a `Vec<T>` of plain-old-data values into a `bytes::Bytes` buffer
/// holding the in-memory byte representation of the elements.
///
/// The vector is viewed as a byte slice and copied. A zero-copy
/// `bytemuck::allocation::cast_vec::<T, u8>` is not usable here: it fails
/// (and therefore panics) with `AlignmentMismatch` whenever
/// `align_of::<T>() != align_of::<u8>()`, which is the case for every
/// `i16`/`i32`/`i64`/`f32`/`f64` vector this module produces.
fn vec_to_bytes<T: bytemuck::NoUninit>(values: Vec<T>) -> bytes::Bytes {
    // `NoUninit` guarantees `T` has no padding, so every viewed byte is
    // initialised. Casting *down* to `u8` cannot fail on alignment.
    let raw: &[u8] = bytemuck::cast_slice::<T, u8>(values.as_slice());
    bytes::Bytes::copy_from_slice(raw)
}
use crate::metadata::ATTRIBUTE_KEY;
use crate::metadata::PRIMITIVE_KEY;
use crate::metadata::SHAPE_KEY;
use crate::metadata::SORTED_KEY;
// ---------------------------------------------------------------------------
// Metadata hint extraction
// ---------------------------------------------------------------------------
/// Optional ingestion hints recovered from an Arrow field's metadata.
///
/// Every member is `None` when the corresponding metadata entry is missing
/// or holds a value that the matching parser does not recognise.
#[derive(Clone, Copy, Default, Debug)]
struct IngestHint {
// Requested q shape (read from the `SHAPE_KEY` metadata entry).
shape: Option<IngestShape>,
// Requested q primitive type (read from the `PRIMITIVE_KEY` metadata entry).
primitive: Option<IngestPrimitive>,
// q attribute to attach to the result (read from the `ATTRIBUTE_KEY` entry).
attribute: Option<Attribute>,
// Sorted flag used for dictionaries (read from the `SORTED_KEY` entry,
// parsed as a Rust `bool`).
sorted: Option<bool>,
}
/// The q value shape an Arrow array should be ingested as.
///
/// Selected either by the shape metadata hint or, when no hint is present,
/// from the Arrow data type.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum IngestShape {
// A single q atom; the Arrow array must have length 1.
Atom,
// A typed q vector.
Vector,
// A general q list.
List,
// A q dictionary (ingested from an Arrow map array).
Dictionary,
// A q table (ingested from an Arrow struct array).
Table,
// A q unary primitive; the Arrow array must have length 1.
UnaryPrimitive,
}
/// The q primitive type named by the primitive metadata hint.
///
/// Used to disambiguate Arrow types that can stand for more than one q type
/// (for example `UInt8` as byte or char, `Int32` as int/month/date/minute/
/// second/time, `Float64` as float or datetime).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum IngestPrimitive {
Boolean,
Guid,
Byte,
Short,
Int,
Long,
Real,
Float,
Char,
Symbol,
Timestamp,
Month,
Date,
Datetime,
Timespan,
Minute,
Second,
Time,
}
fn hint_from_field(field: &ArrowField) -> IngestHint {
let meta = field.metadata();
IngestHint {
shape: meta.get(SHAPE_KEY).and_then(|s| parse_shape(s)),
primitive: meta.get(PRIMITIVE_KEY).and_then(|s| parse_primitive(s)),
attribute: meta.get(ATTRIBUTE_KEY).and_then(|s| parse_attribute(s)),
sorted: meta.get(SORTED_KEY).and_then(|s| s.parse::<bool>().ok()),
}
}
/// Parses a shape metadata value; returns `None` for unrecognised text.
fn parse_shape(s: &str) -> Option<IngestShape> {
    let shape = match s {
        "atom" => IngestShape::Atom,
        "vector" => IngestShape::Vector,
        "list" => IngestShape::List,
        "dictionary" => IngestShape::Dictionary,
        "table" => IngestShape::Table,
        "unary_primitive" => IngestShape::UnaryPrimitive,
        _ => return None,
    };
    Some(shape)
}
/// Parses a primitive-type metadata value; returns `None` for unrecognised text.
fn parse_primitive(s: &str) -> Option<IngestPrimitive> {
    let primitive = match s {
        "boolean" => IngestPrimitive::Boolean,
        "guid" => IngestPrimitive::Guid,
        "byte" => IngestPrimitive::Byte,
        "short" => IngestPrimitive::Short,
        "int" => IngestPrimitive::Int,
        "long" => IngestPrimitive::Long,
        "real" => IngestPrimitive::Real,
        "float" => IngestPrimitive::Float,
        "char" => IngestPrimitive::Char,
        "symbol" => IngestPrimitive::Symbol,
        "timestamp" => IngestPrimitive::Timestamp,
        "month" => IngestPrimitive::Month,
        "date" => IngestPrimitive::Date,
        "datetime" => IngestPrimitive::Datetime,
        "timespan" => IngestPrimitive::Timespan,
        "minute" => IngestPrimitive::Minute,
        "second" => IngestPrimitive::Second,
        "time" => IngestPrimitive::Time,
        _ => return None,
    };
    Some(primitive)
}
/// Parses an attribute metadata value; returns `None` for unrecognised text.
fn parse_attribute(s: &str) -> Option<Attribute> {
    let attribute = match s {
        "none" => Attribute::None,
        "sorted" => Attribute::Sorted,
        "unique" => Attribute::Unique,
        "parted" => Attribute::Parted,
        "grouped" => Attribute::Grouped,
        _ => return None,
    };
    Some(attribute)
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/// Convert an Arrow array + field descriptor into a q `Value`.
///
/// The field's metadata is read for ingestion hints (shape, primitive,
/// attribute, sorted) before the array is dispatched.
pub fn ingest_array(array: ArrayRef, field: &ArrowField) -> IngestionResult<Value> {
    ingest_with_hint(array, hint_from_field(field))
}
/// Convert an Arrow record batch into a q table `Value`.
///
/// Each column is ingested with its schema field (and therefore its
/// metadata hints); the resulting table carries no attribute.
///
/// # Errors
/// Propagates column ingestion errors and reports a failed table validation
/// as `IngestionError::Unsupported`.
pub fn ingest_record_batch(batch: RecordBatch) -> IngestionResult<Value> {
    let schema = batch.schema();
    let fields = schema.fields();

    let column_names: Vec<bytes::Bytes> = fields
        .iter()
        .map(|field| bytes::Bytes::copy_from_slice(field.name().as_bytes()))
        .collect();
    let columns = fields
        .iter()
        .zip(batch.columns())
        .map(|(field, column)| ingest_array(column.clone(), field.as_ref()))
        .collect::<Result<Vec<Value>, _>>()?;

    let table = Table::new(Attribute::None, column_names, columns);
    if let Err(problem) = table.validate() {
        return Err(IngestionError::Unsupported(problem.to_string()));
    }
    Ok(Value::Table(table))
}
/// Convert a sequence of record batches (a stream) into a q table `Value`.
///
/// All batches must share the same schema. The batches are concatenated using
/// `arrow_select::concat::concat_batches` before ingestion. When the stream
/// yields no batches at all, an empty table is built from `schema` alone.
///
/// # Errors
/// Fails on the first erroring batch in the stream, on concatenation
/// failure, or on any column ingestion error.
pub fn ingest_record_batch_reader(
    schema: SchemaRef,
    batches: impl IntoIterator<Item = Result<RecordBatch, arrow_schema::ArrowError>>,
) -> IngestionResult<Value> {
    let collected = batches
        .into_iter()
        .collect::<Result<Vec<RecordBatch>, _>>()?;

    if !collected.is_empty() {
        let merged = arrow_select::concat::concat_batches(&schema, &collected)?;
        return ingest_record_batch(merged);
    }

    // No batches: produce an empty table with the correct schema.
    let mut column_names: Vec<bytes::Bytes> = Vec::with_capacity(schema.fields().len());
    let mut columns: Vec<Value> = Vec::with_capacity(schema.fields().len());
    for field in schema.fields().iter() {
        let empty = arrow_array::new_empty_array(field.data_type());
        columns.push(ingest_array(empty, field.as_ref())?);
        column_names.push(bytes::Bytes::copy_from_slice(field.name().as_bytes()));
    }
    Ok(Value::Table(Table::new(
        Attribute::None,
        column_names,
        columns,
    )))
}
// ---------------------------------------------------------------------------
// Main dispatch
// ---------------------------------------------------------------------------
/// Routes an array to the ingestion routine for its q shape.
///
/// The shape is taken from `hint.shape` when present, otherwise derived from
/// the array's Arrow data type via `default_shape`.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when a unary-primitive shape is
/// requested for an array whose length is not 1, and propagates errors from
/// the shape-specific routines.
fn ingest_with_hint(array: ArrayRef, hint: IngestHint) -> IngestionResult<Value> {
    let shape = match hint.shape {
        Some(explicit) => explicit,
        None => default_shape(array.data_type()),
    };
    match shape {
        IngestShape::Atom | IngestShape::Vector => ingest_scalar_or_vector(array, shape, hint),
        IngestShape::List => ingest_list(array, hint),
        IngestShape::Dictionary => ingest_dictionary(array, hint),
        IngestShape::Table => ingest_table(array, hint),
        IngestShape::UnaryPrimitive if array.len() == 1 => {
            Ok(Value::UnaryPrimitive { opcode: -128 })
        }
        IngestShape::UnaryPrimitive => Err(IngestionError::Unsupported(format!(
            "unary_primitive shape requires length 1, got {}",
            array.len()
        ))),
    }
}
/// Picks the q shape for an Arrow data type when no shape hint is present.
///
/// Binary-like types default to a list (one char vector per blob); explicit
/// metadata (`qroissant.shape=vector`) is required to get a single char
/// vector instead. Anything not listed is treated as a typed vector.
fn default_shape(dt: &DataType) -> IngestShape {
    match dt {
        DataType::Struct(_) => IngestShape::Table,
        DataType::Map(_, _) => IngestShape::Dictionary,
        DataType::Null
        | DataType::List(_)
        | DataType::LargeList(_)
        | DataType::Binary
        | DataType::LargeBinary
        | DataType::BinaryView => IngestShape::List,
        _ => IngestShape::Vector,
    }
}
// ---------------------------------------------------------------------------
// Table ingestion (Struct array)
// ---------------------------------------------------------------------------
/// Ingests an Arrow struct array as a q table, one column per struct child.
///
/// The table attribute comes from the hint (default `Attribute::None`); each
/// child is ingested with its own field so nested metadata hints apply.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array is not a struct
/// array or the built table fails validation; propagates child errors.
fn ingest_table(array: ArrayRef, hint: IngestHint) -> IngestionResult<Value> {
    let struct_array = match array.as_any().downcast_ref::<StructArray>() {
        Some(found) => found,
        None => {
            return Err(IngestionError::Unsupported(format!(
                "q table ingestion requires a StructArray, found {}",
                array.data_type()
            )));
        }
    };
    let fields = match array.data_type() {
        DataType::Struct(fields) => fields.clone(),
        other => {
            return Err(IngestionError::Unsupported(format!(
                "q table ingestion requires a struct field, found {other}"
            )));
        }
    };

    let column_names: Vec<bytes::Bytes> = fields
        .iter()
        .map(|child| bytes::Bytes::copy_from_slice(child.name().as_bytes()))
        .collect();
    let columns = fields
        .iter()
        .enumerate()
        .map(|(index, child)| ingest_array(struct_array.column(index).clone(), child.as_ref()))
        .collect::<Result<Vec<Value>, _>>()?;

    let attribute = hint.attribute.unwrap_or(Attribute::None);
    let table = Table::new(attribute, column_names, columns);
    match table.validate() {
        Ok(_) => Ok(Value::Table(table)),
        Err(problem) => Err(IngestionError::Unsupported(problem.to_string())),
    }
}
// ---------------------------------------------------------------------------
// Dictionary ingestion (Map array)
// ---------------------------------------------------------------------------
/// Ingests a single-row Arrow map array as a q dictionary.
///
/// The map's one entry struct supplies the keys (column 0) and values
/// (column 1), each ingested with its own entry field. The dictionary's
/// sorted flag comes from the hint and defaults to `false`.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array is not a map array,
/// is not exactly one non-null row, or the built dictionary fails validation.
fn ingest_dictionary(array: ArrayRef, hint: IngestHint) -> IngestionResult<Value> {
    let map_array = match array.as_any().downcast_ref::<MapArray>() {
        Some(found) => found,
        None => {
            return Err(IngestionError::Unsupported(format!(
                "q dictionary ingestion requires a MapArray, found {}",
                array.data_type()
            )));
        }
    };

    // `is_null(0)` is only consulted once the length is known to be 1.
    let is_single_valid_row = map_array.len() == 1 && !map_array.is_null(0);
    if !is_single_valid_row {
        return Err(IngestionError::Unsupported(
            "q dictionary ingestion requires a non-null length-1 Arrow map".to_string(),
        ));
    }

    let entries = map_array.value(0);
    let entry_fields = entries.fields();
    let keys = ingest_array(entries.column(0).clone(), entry_fields[0].as_ref())?;
    let values = ingest_array(entries.column(1).clone(), entry_fields[1].as_ref())?;

    let sorted = matches!(hint.sorted, Some(true));
    let dict = Dictionary::new(sorted, keys, values);
    match dict.validate() {
        Ok(_) => Ok(Value::Dictionary(dict)),
        Err(problem) => Err(IngestionError::Unsupported(problem.to_string())),
    }
}
// ---------------------------------------------------------------------------
// List ingestion (List / LargeList / Binary / BinaryView arrays)
// ---------------------------------------------------------------------------
fn ingest_list(array: ArrayRef, hint: IngestHint) -> IngestionResult<Value> {
let attribute = hint.attribute.unwrap_or(Attribute::None);
match array.data_type() {
DataType::Null => {
let values = (0..array.len())
.map(|_| Value::UnaryPrimitive { opcode: -128 })
.collect();
Ok(Value::List(List::new(attribute, values)))
}
DataType::List(child_field) => {
let child_field = child_field.clone();
let list_array = array
.as_any()
.downcast_ref::<ListArray>()
.expect("List datatype must match ListArray");
let mut values = Vec::with_capacity(list_array.len());
for i in 0..list_array.len() {
let child = list_array.value(i);
values.push(ingest_array(child, child_field.as_ref())?);
}
Ok(Value::List(List::new(attribute, values)))
}
DataType::LargeList(child_field) => {
let child_field = child_field.clone();
let list_array = array
.as_any()
.downcast_ref::<LargeListArray>()
.expect("LargeList datatype must match LargeListArray");
let mut values = Vec::with_capacity(list_array.len());
for i in 0..list_array.len() {
let child = list_array.value(i);
values.push(ingest_array(child, child_field.as_ref())?);
}
Ok(Value::List(List::new(attribute, values)))
}
DataType::Binary => {
let binary = array
.as_any()
.downcast_ref::<BinaryArray>()
.expect("Binary datatype must match BinaryArray");
let values = (0..binary.len())
.map(|i| {
Value::Vector(Vector::new(
Attribute::None,
VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))),
))
})
.collect();
Ok(Value::List(List::new(attribute, values)))
}
DataType::LargeBinary => {
let binary = array
.as_any()
.downcast_ref::<LargeBinaryArray>()
.expect("LargeBinary datatype must match LargeBinaryArray");
let values = (0..binary.len())
.map(|i| {
Value::Vector(Vector::new(
Attribute::None,
VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))),
))
})
.collect();
Ok(Value::List(List::new(attribute, values)))
}
DataType::BinaryView => {
let binary = array
.as_any()
.downcast_ref::<BinaryViewArray>()
.expect("BinaryView datatype must match BinaryViewArray");
let values = (0..binary.len())
.map(|i| {
Value::Vector(Vector::new(
Attribute::None,
VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))),
))
})
.collect();
Ok(Value::List(List::new(attribute, values)))
}
other => Err(IngestionError::Unsupported(format!(
"q list ingestion from Arrow data type {other} is not supported"
))),
}
}
// ---------------------------------------------------------------------------
// Scalar / vector ingestion
// ---------------------------------------------------------------------------
fn ingest_scalar_or_vector(
array: ArrayRef,
shape: IngestShape,
hint: IngestHint,
) -> IngestionResult<Value> {
let attribute = hint.attribute.unwrap_or(Attribute::None);
let is_atom = shape == IngestShape::Atom;
if is_atom && array.len() != 1 {
return Err(IngestionError::Unsupported(format!(
"q atom shape requested but Arrow array has length {}",
array.len()
)));
}
match array.data_type() {
DataType::Boolean => ingest_boolean(&array, is_atom, attribute),
DataType::UInt8 => {
let prim = hint.primitive.unwrap_or(IngestPrimitive::Byte);
ingest_u8(&array, prim, is_atom, attribute)
}
DataType::Int16 => ingest_i16(&array, is_atom, attribute),
DataType::Int32 => {
let prim = hint.primitive.unwrap_or(IngestPrimitive::Int);
ingest_i32(&array, prim, is_atom, attribute)
}
DataType::Int64 => ingest_i64(&array, is_atom, attribute),
DataType::Float32 => ingest_f32(&array, is_atom, attribute),
DataType::Float64 => {
let prim = hint.primitive.unwrap_or(IngestPrimitive::Float);
ingest_f64(&array, prim, is_atom, attribute)
}
DataType::FixedSizeBinary(1) => {
let prim = hint.primitive.unwrap_or(IngestPrimitive::Char);
ingest_fixed_binary_1(&array, prim, is_atom, attribute)
}
DataType::FixedSizeBinary(16) => ingest_fixed_binary_16(&array, is_atom, attribute),
DataType::Utf8 => ingest_symbols_utf8(&array, is_atom, attribute),
DataType::LargeUtf8 => ingest_symbols_large_utf8(&array, is_atom, attribute),
DataType::Utf8View => ingest_symbols_utf8_view(&array, is_atom, attribute),
DataType::Dictionary(_, _) => ingest_symbols_dictionary(&array, is_atom, attribute),
DataType::Binary => ingest_binary_as_char(&array, is_atom, attribute),
DataType::LargeBinary => ingest_large_binary_as_char(&array, is_atom, attribute),
DataType::BinaryView => ingest_binary_view_as_char(&array, is_atom, attribute),
DataType::Date32 => ingest_date32(&array, is_atom, attribute),
DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
if tz.is_some() {
return Err(IngestionError::Unsupported(
"Arrow timestamps with timezone cannot be ingested into q".to_string(),
));
}
ingest_timestamp_ns(&array, is_atom, attribute)
}
DataType::Timestamp(TimeUnit::Microsecond, tz) => {
if tz.is_some() {
return Err(IngestionError::Unsupported(
"Arrow timestamps with timezone cannot be ingested into q".to_string(),
));
}
ingest_timestamp_us(&array, is_atom, attribute)
}
DataType::Timestamp(TimeUnit::Millisecond, tz) => {
if tz.is_some() {
return Err(IngestionError::Unsupported(
"Arrow timestamps with timezone cannot be ingested into q".to_string(),
));
}
ingest_timestamp_ms(&array, is_atom, attribute)
}
DataType::Timestamp(TimeUnit::Second, tz) => {
if tz.is_some() {
return Err(IngestionError::Unsupported(
"Arrow timestamps with timezone cannot be ingested into q".to_string(),
));
}
ingest_timestamp_s(&array, is_atom, attribute)
}
DataType::Duration(TimeUnit::Nanosecond) => ingest_duration_ns(&array, is_atom, attribute),
DataType::Duration(TimeUnit::Microsecond) => ingest_duration_us(&array, is_atom, attribute),
DataType::Duration(TimeUnit::Millisecond) => ingest_duration_ms(&array, is_atom, attribute),
DataType::Duration(TimeUnit::Second) => ingest_duration_s(&array, is_atom, attribute),
DataType::Time32(TimeUnit::Second) => {
let prim = hint.primitive.unwrap_or(IngestPrimitive::Second);
ingest_time32_second(&array, prim, is_atom, attribute)
}
DataType::Time32(TimeUnit::Millisecond) => ingest_time32_ms(&array, is_atom, attribute),
DataType::Time64(TimeUnit::Microsecond) => ingest_time64_us(&array, is_atom, attribute),
DataType::Time64(TimeUnit::Nanosecond) => ingest_time64_ns(&array, is_atom, attribute),
other => Err(IngestionError::Unsupported(format!(
"q ingestion from Arrow data type {other} is not supported"
))),
}
}
// ---------------------------------------------------------------------------
// Boolean
// ---------------------------------------------------------------------------
/// Ingests an Arrow boolean array as a q boolean atom or vector.
///
/// Vector payloads store one byte per element (`1` for true, `0` for false).
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls.
///
/// # Panics
/// Panics if `array` is not a `BooleanArray`, or if `is_atom` is set and the
/// array is empty.
fn ingest_boolean(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<BooleanArray>()
        .expect("Boolean datatype must match BooleanArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow boolean arrays with nulls cannot be ingested as q boolean vectors; \
             use a general list shape instead"
                .to_string(),
        ));
    }

    if is_atom {
        return Ok(Value::Atom(Atom::Boolean(arr.value(0))));
    }

    let flags: Vec<u8> = (0..arr.len())
        .map(|row| u8::from(arr.value(row)))
        .collect();
    Ok(Value::Vector(Vector::new(
        attribute,
        VectorData::Boolean(bytes::Bytes::from(flags)),
    )))
}
// ---------------------------------------------------------------------------
// UInt8 (Byte or Char)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `UInt8` array as q char or byte data.
///
/// `prim == Char` yields char data; every other primitive yields byte data.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls.
///
/// # Panics
/// Panics if `array` is not a `UInt8Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_u8(
    array: &ArrayRef,
    prim: IngestPrimitive,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<UInt8Array>()
        .expect("UInt8 datatype must match UInt8Array");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow UInt8 arrays with nulls cannot be ingested as q byte/char".to_string(),
        ));
    }

    let raw: Vec<u8> = arr.values().to_vec();
    let as_char = prim == IngestPrimitive::Char;

    if is_atom {
        let first = raw[0];
        let atom = if as_char {
            Atom::Char(first)
        } else {
            Atom::Byte(first)
        };
        return Ok(Value::Atom(atom));
    }

    let payload = bytes::Bytes::from(raw);
    let data = if as_char {
        VectorData::Char(payload)
    } else {
        VectorData::Byte(payload)
    };
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Int16 (Short)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Int16` array as a q short atom or vector.
///
/// Arrow nulls are replaced by `Q_NULL_SHORT`.
///
/// # Panics
/// Panics if `array` is not an `Int16Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_i16(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Int16Array>()
        .expect("Int16 datatype must match Int16Array");

    let shorts: Vec<i16> = arr
        .iter()
        .map(|slot| slot.unwrap_or(Q_NULL_SHORT))
        .collect();

    if is_atom {
        return Ok(Value::Atom(Atom::Short(shorts[0])));
    }
    let data = VectorData::Short(vec_to_bytes(shorts));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Int32 (Int, Month, Date, Minute, Second, Time)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Int32` array as one of the 32-bit q types.
///
/// `prim` selects month, date, minute, second or time; any other primitive
/// yields a plain q int. Values are taken as-is (no epoch shifting); Arrow
/// nulls are replaced by `i32::MIN`, the null shared by all i32 q types.
///
/// # Panics
/// Panics if `array` is not an `Int32Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_i32(
    array: &ArrayRef,
    prim: IngestPrimitive,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Int32Array>()
        .expect("Int32 datatype must match Int32Array");

    let ints: Vec<i32> = arr.iter().map(|slot| slot.unwrap_or(i32::MIN)).collect();

    if !is_atom {
        let payload = vec_to_bytes(ints);
        let data = match prim {
            IngestPrimitive::Month => VectorData::Month(payload),
            IngestPrimitive::Date => VectorData::Date(payload),
            IngestPrimitive::Minute => VectorData::Minute(payload),
            IngestPrimitive::Second => VectorData::Second(payload),
            IngestPrimitive::Time => VectorData::Time(payload),
            _ => VectorData::Int(payload),
        };
        return Ok(Value::Vector(Vector::new(attribute, data)));
    }

    let scalar = ints[0];
    Ok(Value::Atom(match prim {
        IngestPrimitive::Month => Atom::Month(scalar),
        IngestPrimitive::Date => Atom::Date(scalar),
        IngestPrimitive::Minute => Atom::Minute(scalar),
        IngestPrimitive::Second => Atom::Second(scalar),
        IngestPrimitive::Time => Atom::Time(scalar),
        _ => Atom::Int(scalar),
    }))
}
// ---------------------------------------------------------------------------
// Int64 (Long)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Int64` array as a q long atom or vector.
///
/// Arrow nulls are replaced by `i64::MIN`.
///
/// # Panics
/// Panics if `array` is not an `Int64Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_i64(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Int64Array>()
        .expect("Int64 datatype must match Int64Array");

    let longs: Vec<i64> = arr.iter().map(|slot| slot.unwrap_or(i64::MIN)).collect();

    if is_atom {
        return Ok(Value::Atom(Atom::Long(longs[0])));
    }
    let data = VectorData::Long(vec_to_bytes(longs));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Float32 (Real)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Float32` array as a q real atom or vector.
///
/// Arrow nulls are replaced by `f32::NAN`.
///
/// # Panics
/// Panics if `array` is not a `Float32Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_f32(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Float32Array>()
        .expect("Float32 datatype must match Float32Array");

    let reals: Vec<f32> = arr.iter().map(|slot| slot.unwrap_or(f32::NAN)).collect();

    if is_atom {
        return Ok(Value::Atom(Atom::Real(reals[0])));
    }
    let data = VectorData::Real(vec_to_bytes(reals));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Float64 (Float, Datetime)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Float64` array as q float or datetime data.
///
/// `prim == Datetime` yields datetime data; every other primitive yields a
/// plain q float. Arrow nulls are replaced by `f64::NAN`.
///
/// # Panics
/// Panics if `array` is not a `Float64Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_f64(
    array: &ArrayRef,
    prim: IngestPrimitive,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Float64Array>()
        .expect("Float64 datatype must match Float64Array");

    let floats: Vec<f64> = arr.iter().map(|slot| slot.unwrap_or(f64::NAN)).collect();
    let as_datetime = prim == IngestPrimitive::Datetime;

    if is_atom {
        let scalar = floats[0];
        let atom = if as_datetime {
            Atom::Datetime(scalar)
        } else {
            Atom::Float(scalar)
        };
        return Ok(Value::Atom(atom));
    }

    let payload = vec_to_bytes(floats);
    let data = if as_datetime {
        VectorData::Datetime(payload)
    } else {
        VectorData::Float(payload)
    };
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// FixedSizeBinary(1) Char or Byte
// ---------------------------------------------------------------------------
/// Ingests an Arrow `FixedSizeBinary(1)` array as q byte or char data.
///
/// `prim == Byte` yields byte data; every other primitive yields char data.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls.
///
/// # Panics
/// Panics if `array` is not a `FixedSizeBinaryArray`, or if `is_atom` is set
/// and the array is empty.
fn ingest_fixed_binary_1(
    array: &ArrayRef,
    prim: IngestPrimitive,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .expect("FixedSizeBinary(1) datatype must match FixedSizeBinaryArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow FixedSizeBinary(1) arrays with nulls cannot be ingested as q char/byte"
                .to_string(),
        ));
    }

    // Each element is a one-byte slice; keep only that byte.
    let raw: Vec<u8> = (0..arr.len()).map(|row| arr.value(row)[0]).collect();
    let as_byte = prim == IngestPrimitive::Byte;

    if is_atom {
        let first = raw[0];
        let atom = if as_byte {
            Atom::Byte(first)
        } else {
            Atom::Char(first)
        };
        return Ok(Value::Atom(atom));
    }

    let payload = bytes::Bytes::from(raw);
    let data = if as_byte {
        VectorData::Byte(payload)
    } else {
        VectorData::Char(payload)
    };
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// FixedSizeBinary(16) Guid
// ---------------------------------------------------------------------------
/// Ingests an Arrow `FixedSizeBinary(16)` array as a q guid atom or vector.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls.
///
/// # Panics
/// Panics if `array` is not a `FixedSizeBinaryArray`, if an element is not
/// exactly 16 bytes long, or if `is_atom` is set and the array is empty.
fn ingest_fixed_binary_16(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .expect("FixedSizeBinary(16) datatype must match FixedSizeBinaryArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow FixedSizeBinary(16) arrays with nulls cannot be ingested as q guid".to_string(),
        ));
    }

    let mut guids: Vec<[u8; 16]> = Vec::with_capacity(arr.len());
    for row in 0..arr.len() {
        let mut guid = [0u8; 16];
        guid.copy_from_slice(arr.value(row));
        guids.push(guid);
    }

    let value = if is_atom {
        Value::Atom(Atom::Guid(guids[0]))
    } else {
        Value::Vector(Vector::new(attribute, VectorData::from_guids(&guids)))
    };
    Ok(value)
}
// ---------------------------------------------------------------------------
// Symbol (various string types)
// ---------------------------------------------------------------------------
/// Wraps raw symbol byte strings as a q symbol atom or symbol vector.
///
/// For an atom only the first string is used; an empty input yields the
/// empty symbol. `attribute` applies to the vector form only.
fn strings_to_symbol_value(strings: Vec<Vec<u8>>, is_atom: bool, attribute: Attribute) -> Value {
    if !is_atom {
        let symbols = strings.into_iter().map(bytes::Bytes::from).collect();
        return Value::Vector(Vector::new(attribute, VectorData::Symbol(symbols)));
    }
    let first = strings.into_iter().next().unwrap_or_default();
    Value::Atom(Atom::Symbol(bytes::Bytes::from(first)))
}
/// Ingests an Arrow `Utf8` array as q symbols.
///
/// Null slots become the empty symbol.
///
/// # Panics
/// Panics if `array` is not a `StringArray`.
fn ingest_symbols_utf8(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("Utf8 datatype must match StringArray");
    let symbols: Vec<Vec<u8>> = arr
        .iter()
        .map(|slot| match slot {
            Some(text) => text.as_bytes().to_vec(),
            None => Vec::new(),
        })
        .collect();
    Ok(strings_to_symbol_value(symbols, is_atom, attribute))
}
/// Ingests an Arrow `LargeUtf8` array as q symbols.
///
/// Null slots become the empty symbol.
///
/// # Panics
/// Panics if `array` is not a `LargeStringArray`.
fn ingest_symbols_large_utf8(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<LargeStringArray>()
        .expect("LargeUtf8 datatype must match LargeStringArray");
    let symbols: Vec<Vec<u8>> = arr
        .iter()
        .map(|slot| match slot {
            Some(text) => text.as_bytes().to_vec(),
            None => Vec::new(),
        })
        .collect();
    Ok(strings_to_symbol_value(symbols, is_atom, attribute))
}
/// Ingests an Arrow `Utf8View` array as q symbols.
///
/// Null slots become the empty symbol.
///
/// # Panics
/// Panics if `array` is not a `StringViewArray`.
fn ingest_symbols_utf8_view(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<StringViewArray>()
        .expect("Utf8View datatype must match StringViewArray");
    let symbols: Vec<Vec<u8>> = arr
        .iter()
        .map(|slot| match slot {
            Some(text) => text.as_bytes().to_vec(),
            None => Vec::new(),
        })
        .collect();
    Ok(strings_to_symbol_value(symbols, is_atom, attribute))
}
fn ingest_symbols_dictionary(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
macro_rules! try_dict_type {
($key_type:ty) => {{
if let Some(dict) = array
.as_any()
.downcast_ref::<arrow_array::DictionaryArray<$key_type>>()
{
let values_arr = dict.values();
let strings: Vec<Vec<u8>> = (0..dict.len())
.map(|i| {
if dict.is_null(i) {
return vec![];
}
let key_idx = dict.key(i).expect("non-null key must have value") as usize;
if let Some(s) = values_arr.as_any().downcast_ref::<StringArray>() {
s.value(key_idx).as_bytes().to_vec()
} else if let Some(s) =
values_arr.as_any().downcast_ref::<LargeStringArray>()
{
s.value(key_idx).as_bytes().to_vec()
} else if let Some(s) =
values_arr.as_any().downcast_ref::<StringViewArray>()
{
s.value(key_idx).as_bytes().to_vec()
} else {
vec![]
}
})
.collect();
return Ok(strings_to_symbol_value(strings, is_atom, attribute));
}
}};
}
try_dict_type!(arrow_array::types::Int8Type);
try_dict_type!(arrow_array::types::Int16Type);
try_dict_type!(arrow_array::types::Int32Type);
try_dict_type!(arrow_array::types::Int64Type);
try_dict_type!(arrow_array::types::UInt8Type);
try_dict_type!(arrow_array::types::UInt16Type);
try_dict_type!(arrow_array::types::UInt32Type);
try_dict_type!(arrow_array::types::UInt64Type);
Err(IngestionError::Unsupported(
"Unsupported dictionary key type for symbol ingestion".to_string(),
))
}
// ---------------------------------------------------------------------------
// Binary → Char vector (single-element binary → char vector)
// ---------------------------------------------------------------------------
/// Ingests a single-element Arrow `Binary` array as q char data.
///
/// The one blob becomes a char vector. A char atom is produced only when
/// `is_atom` is set *and* the blob is exactly one byte long; otherwise a
/// char vector is returned even for an atom request.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls or
/// does not have exactly one element.
///
/// # Panics
/// Panics if `array` is not a `BinaryArray`.
fn ingest_binary_as_char(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<BinaryArray>()
        .expect("Binary datatype must match BinaryArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow Binary arrays with nulls cannot be ingested as q char vectors".to_string(),
        ));
    }
    if arr.len() != 1 {
        return Err(IngestionError::Unsupported(
            "Multi-element Binary arrays should use List shape for q ingestion".to_string(),
        ));
    }

    let payload = arr.value(0).to_vec();
    let single_char_atom = is_atom && payload.len() == 1;
    if single_char_atom {
        return Ok(Value::Atom(Atom::Char(payload[0])));
    }
    let data = VectorData::Char(bytes::Bytes::from(payload));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
/// Ingests a single-element Arrow `LargeBinary` array as q char data.
///
/// The one blob becomes a char vector. A char atom is produced only when
/// `is_atom` is set *and* the blob is exactly one byte long; otherwise a
/// char vector is returned even for an atom request.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls or
/// does not have exactly one element.
///
/// # Panics
/// Panics if `array` is not a `LargeBinaryArray`.
fn ingest_large_binary_as_char(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<LargeBinaryArray>()
        .expect("LargeBinary datatype must match LargeBinaryArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow LargeBinary arrays with nulls cannot be ingested as q char vectors".to_string(),
        ));
    }
    if arr.len() != 1 {
        return Err(IngestionError::Unsupported(
            "Multi-element LargeBinary arrays should use List shape for q ingestion".to_string(),
        ));
    }

    let payload = arr.value(0).to_vec();
    let single_char_atom = is_atom && payload.len() == 1;
    if single_char_atom {
        return Ok(Value::Atom(Atom::Char(payload[0])));
    }
    let data = VectorData::Char(bytes::Bytes::from(payload));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
/// Ingests a single-element Arrow `BinaryView` array as q char data.
///
/// The one blob becomes a char vector. A char atom is produced only when
/// `is_atom` is set *and* the blob is exactly one byte long; otherwise a
/// char vector is returned even for an atom request.
///
/// # Errors
/// Returns `IngestionError::Unsupported` when the array contains nulls or
/// does not have exactly one element.
///
/// # Panics
/// Panics if `array` is not a `BinaryViewArray`.
fn ingest_binary_view_as_char(
    array: &ArrayRef,
    is_atom: bool,
    attribute: Attribute,
) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<BinaryViewArray>()
        .expect("BinaryView datatype must match BinaryViewArray");
    if arr.null_count() > 0 {
        return Err(IngestionError::Unsupported(
            "Arrow BinaryView arrays with nulls cannot be ingested as q char vectors".to_string(),
        ));
    }
    if arr.len() != 1 {
        return Err(IngestionError::Unsupported(
            "Multi-element BinaryView arrays should use List shape for q ingestion".to_string(),
        ));
    }

    let payload = arr.value(0).to_vec();
    let single_char_atom = is_atom && payload.len() == 1;
    if single_char_atom {
        return Ok(Value::Atom(Atom::Char(payload[0])));
    }
    let data = VectorData::Char(bytes::Bytes::from(payload));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Date32 → q Date (days since 2000-01-01)
// ---------------------------------------------------------------------------
/// Ingests an Arrow `Date32` array as a q date atom or vector.
///
/// Arrow nulls become `Q_NULL_DATE`. Every other value has
/// `DATE_OFFSET_DAYS` subtracted (saturating) to rebase it onto the q date
/// epoch. A non-null input that already equals `Q_NULL_DATE` is left
/// unshifted.
///
/// # Panics
/// Panics if `array` is not a `Date32Array`, or if `is_atom` is set and the
/// array is empty.
fn ingest_date32(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult<Value> {
    let arr = array
        .as_any()
        .downcast_ref::<Date32Array>()
        .expect("Date32 datatype must match Date32Array");

    let days: Vec<i32> = arr
        .values()
        .iter()
        .enumerate()
        .map(|(row, &stored)| {
            let raw = if arr.is_null(row) { Q_NULL_DATE } else { stored };
            if raw == Q_NULL_DATE {
                raw
            } else {
                raw.saturating_sub(DATE_OFFSET_DAYS)
            }
        })
        .collect();

    if is_atom {
        return Ok(Value::Atom(Atom::Date(days[0])));
    }
    let data = VectorData::Date(vec_to_bytes(days));
    Ok(Value::Vector(Vector::new(attribute, data)))
}
// ---------------------------------------------------------------------------
// Timestamp → q Timestamp (ns since 2000-01-01)
// ---------------------------------------------------------------------------
fn ingest_timestamp_ns(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<TimestampNanosecondArray>()
.expect("Timestamp(Nanosecond) must match TimestampNanosecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESTAMP;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESTAMP {
*v = v.saturating_sub(TIMESTAMP_OFFSET_NS);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timestamp(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timestamp(vec_to_bytes(values)),
)))
}
}
fn ingest_timestamp_us(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<TimestampMicrosecondArray>()
.expect("Timestamp(Microsecond) must match TimestampMicrosecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESTAMP;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESTAMP {
*v = v.saturating_mul(1_000).saturating_sub(TIMESTAMP_OFFSET_NS);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timestamp(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timestamp(vec_to_bytes(values)),
)))
}
}
fn ingest_timestamp_ms(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<TimestampMillisecondArray>()
.expect("Timestamp(Millisecond) must match TimestampMillisecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESTAMP;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESTAMP {
*v = v
.saturating_mul(1_000_000)
.saturating_sub(TIMESTAMP_OFFSET_NS);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timestamp(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timestamp(vec_to_bytes(values)),
)))
}
}
fn ingest_timestamp_s(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<TimestampSecondArray>()
.expect("Timestamp(Second) must match TimestampSecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESTAMP;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESTAMP {
*v = v
.saturating_mul(1_000_000_000)
.saturating_sub(TIMESTAMP_OFFSET_NS);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timestamp(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timestamp(vec_to_bytes(values)),
)))
}
}
// ---------------------------------------------------------------------------
// Duration → q Timespan (ns)
// ---------------------------------------------------------------------------
fn ingest_duration_ns(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<DurationNanosecondArray>()
.expect("Duration(Nanosecond) must match DurationNanosecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESPAN;
}
}
}
if is_atom {
Ok(Value::Atom(Atom::Timespan(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timespan(vec_to_bytes(values)),
)))
}
}
fn ingest_duration_us(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<DurationMicrosecondArray>()
.expect("Duration(Microsecond) must match DurationMicrosecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESPAN;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESPAN {
*v = v.saturating_mul(1_000);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timespan(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timespan(vec_to_bytes(values)),
)))
}
}
fn ingest_duration_ms(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<DurationMillisecondArray>()
.expect("Duration(Millisecond) must match DurationMillisecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESPAN;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESPAN {
*v = v.saturating_mul(1_000_000);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timespan(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timespan(vec_to_bytes(values)),
)))
}
}
fn ingest_duration_s(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<DurationSecondArray>()
.expect("Duration(Second) must match DurationSecondArray");
let mut values: Vec<i64> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIMESPAN;
}
}
}
for v in &mut values {
if *v != Q_NULL_TIMESPAN {
*v = v.saturating_mul(1_000_000_000);
}
}
if is_atom {
Ok(Value::Atom(Atom::Timespan(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Timespan(vec_to_bytes(values)),
)))
}
}
// ---------------------------------------------------------------------------
// Time32(Second) → q Second or Minute
// ---------------------------------------------------------------------------
fn ingest_time32_second(
array: &ArrayRef,
prim: IngestPrimitive,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<Time32SecondArray>()
.expect("Time32(Second) must match Time32SecondArray");
let mut values: Vec<i32> = arr.values().to_vec();
if prim == IngestPrimitive::Minute {
let null = Q_NULL_MINUTE;
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = null;
}
}
}
for v in &mut values {
if *v != null {
*v /= 60;
}
}
if is_atom {
Ok(Value::Atom(Atom::Minute(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Minute(vec_to_bytes(values)),
)))
}
} else {
let null = Q_NULL_SECOND;
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = null;
}
}
}
if is_atom {
Ok(Value::Atom(Atom::Second(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Second(vec_to_bytes(values)),
)))
}
}
}
// ---------------------------------------------------------------------------
// Time32(Millisecond) → q Time (ms)
// ---------------------------------------------------------------------------
fn ingest_time32_ms(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<Time32MillisecondArray>()
.expect("Time32(Millisecond) must match Time32MillisecondArray");
let mut values: Vec<i32> = arr.values().to_vec();
if arr.null_count() != 0 {
for i in 0..arr.len() {
if arr.is_null(i) {
values[i] = Q_NULL_TIME;
}
}
}
if is_atom {
Ok(Value::Atom(Atom::Time(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Time(vec_to_bytes(values)),
)))
}
}
// ---------------------------------------------------------------------------
// Time64(Microsecond) → q Time (ms, truncating)
// ---------------------------------------------------------------------------
fn ingest_time64_us(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<Time64MicrosecondArray>()
.expect("Time64(Microsecond) must match Time64MicrosecondArray");
let values: Vec<i32> = (0..arr.len())
.map(|i| {
if arr.is_null(i) {
Q_NULL_TIME
} else {
(arr.value(i) / 1_000).clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
}
})
.collect();
if is_atom {
Ok(Value::Atom(Atom::Time(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Time(vec_to_bytes(values)),
)))
}
}
// ---------------------------------------------------------------------------
// Time64(Nanosecond) → q Time (ms, truncating)
// ---------------------------------------------------------------------------
fn ingest_time64_ns(
array: &ArrayRef,
is_atom: bool,
attribute: Attribute,
) -> IngestionResult<Value> {
let arr = array
.as_any()
.downcast_ref::<Time64NanosecondArray>()
.expect("Time64(Nanosecond) must match Time64NanosecondArray");
let values: Vec<i32> = (0..arr.len())
.map(|i| {
if arr.is_null(i) {
Q_NULL_TIME
} else {
(arr.value(i) / 1_000_000).clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
}
})
.collect();
if is_atom {
Ok(Value::Atom(Atom::Time(values[0])))
} else {
Ok(Value::Vector(Vector::new(
attribute,
VectorData::Time(vec_to_bytes(values)),
)))
}
}