//! Arrow ingestion: converts Arrow arrays and record batches into q `Value` trees. //! //! This is the reverse direction of [`crate::projection`]. Arrow field //! metadata produced by the projection layer (`qroissant.shape`, //! `qroissant.primitive`, etc.) is consumed here so that round-trips through //! Arrow preserve exact q semantics. //! //! No PyO3 or Python dependencies are allowed in this crate; PyCapsule //! handling lives in `qroissant-python`. use arrow_array::Array; use arrow_array::ArrayRef; use arrow_array::BinaryArray; use arrow_array::BinaryViewArray; use arrow_array::BooleanArray; use arrow_array::Date32Array; use arrow_array::DurationMicrosecondArray; use arrow_array::DurationMillisecondArray; use arrow_array::DurationNanosecondArray; use arrow_array::DurationSecondArray; use arrow_array::FixedSizeBinaryArray; use arrow_array::Float32Array; use arrow_array::Float64Array; use arrow_array::Int16Array; use arrow_array::Int32Array; use arrow_array::Int64Array; use arrow_array::LargeBinaryArray; use arrow_array::LargeListArray; use arrow_array::LargeStringArray; use arrow_array::ListArray; use arrow_array::MapArray; use arrow_array::RecordBatch; use arrow_array::StringArray; use arrow_array::StringViewArray; use arrow_array::StructArray; use arrow_array::Time32MillisecondArray; use arrow_array::Time32SecondArray; use arrow_array::Time64MicrosecondArray; use arrow_array::Time64NanosecondArray; use arrow_array::TimestampMicrosecondArray; use arrow_array::TimestampMillisecondArray; use arrow_array::TimestampNanosecondArray; use arrow_array::TimestampSecondArray; use arrow_array::UInt8Array; use arrow_schema::DataType; use arrow_schema::Field as ArrowField; use arrow_schema::SchemaRef; use arrow_schema::TimeUnit; use qroissant_core::Atom; use qroissant_core::Attribute; use qroissant_core::Dictionary; use qroissant_core::List; use qroissant_core::Table; use qroissant_core::Value; use qroissant_core::Vector; use qroissant_core::VectorData; use qroissant_kernels::nulls::Q_NULL_DATE; use qroissant_kernels::nulls::Q_NULL_MINUTE; use qroissant_kernels::nulls::Q_NULL_SECOND; use qroissant_kernels::nulls::Q_NULL_SHORT; use qroissant_kernels::nulls::Q_NULL_TIME; use qroissant_kernels::nulls::Q_NULL_TIMESPAN; use qroissant_kernels::nulls::Q_NULL_TIMESTAMP; use qroissant_kernels::temporal::DATE_OFFSET_DAYS; use qroissant_kernels::temporal::TIMESTAMP_OFFSET_NS; use crate::error::IngestionError; use crate::error::IngestionResult; /// Converts a `Vec` to `bytes::Bytes` via zero-copy reinterpretation. fn vec_to_bytes(values: Vec) -> bytes::Bytes { // Safety: bytemuck::cast_vec requires NoUninit, which guarantees no padding. let byte_vec: Vec = bytemuck::allocation::cast_vec(values); bytes::Bytes::from(byte_vec) } use crate::metadata::ATTRIBUTE_KEY; use crate::metadata::PRIMITIVE_KEY; use crate::metadata::SHAPE_KEY; use crate::metadata::SORTED_KEY; // --------------------------------------------------------------------------- // Metadata hint extraction // --------------------------------------------------------------------------- #[derive(Clone, Copy, Default, Debug)] struct IngestHint { shape: Option, primitive: Option, attribute: Option, sorted: Option, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum IngestShape { Atom, Vector, List, Dictionary, Table, UnaryPrimitive, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum IngestPrimitive { Boolean, Guid, Byte, Short, Int, Long, Real, Float, Char, Symbol, Timestamp, Month, Date, Datetime, Timespan, Minute, Second, Time, } fn hint_from_field(field: &ArrowField) -> IngestHint { let meta = field.metadata(); IngestHint { shape: meta.get(SHAPE_KEY).and_then(|s| parse_shape(s)), primitive: meta.get(PRIMITIVE_KEY).and_then(|s| parse_primitive(s)), attribute: meta.get(ATTRIBUTE_KEY).and_then(|s| parse_attribute(s)), sorted: meta.get(SORTED_KEY).and_then(|s| s.parse::().ok()), } } fn parse_shape(s: &str) -> Option { match s { "atom" => Some(IngestShape::Atom), "vector" => Some(IngestShape::Vector), "list" => Some(IngestShape::List), "dictionary" => Some(IngestShape::Dictionary), "table" => Some(IngestShape::Table), "unary_primitive" => Some(IngestShape::UnaryPrimitive), _ => None, } } fn parse_primitive(s: &str) -> Option { match s { "boolean" => Some(IngestPrimitive::Boolean), "guid" => Some(IngestPrimitive::Guid), "byte" => Some(IngestPrimitive::Byte), "short" => Some(IngestPrimitive::Short), "int" => Some(IngestPrimitive::Int), "long" => Some(IngestPrimitive::Long), "real" => Some(IngestPrimitive::Real), "float" => Some(IngestPrimitive::Float), "char" => Some(IngestPrimitive::Char), "symbol" => Some(IngestPrimitive::Symbol), "timestamp" => Some(IngestPrimitive::Timestamp), "month" => Some(IngestPrimitive::Month), "date" => Some(IngestPrimitive::Date), "datetime" => Some(IngestPrimitive::Datetime), "timespan" => Some(IngestPrimitive::Timespan), "minute" => Some(IngestPrimitive::Minute), "second" => Some(IngestPrimitive::Second), "time" => Some(IngestPrimitive::Time), _ => None, } } fn parse_attribute(s: &str) -> Option { match s { "none" => Some(Attribute::None), "sorted" => Some(Attribute::Sorted), "unique" => Some(Attribute::Unique), "parted" => Some(Attribute::Parted), "grouped" => Some(Attribute::Grouped), _ => None, } } // --------------------------------------------------------------------------- // Public API // --------------------------------------------------------------------------- /// Convert an Arrow array + field descriptor into a q `Value`. pub fn ingest_array(array: ArrayRef, field: &ArrowField) -> IngestionResult { let hint = hint_from_field(field); ingest_with_hint(array, hint) } /// Convert an Arrow record batch into a q table `Value`. pub fn ingest_record_batch(batch: RecordBatch) -> IngestionResult { let schema = batch.schema(); let mut column_names = Vec::with_capacity(batch.num_columns()); let mut columns = Vec::with_capacity(batch.num_columns()); for (index, field) in schema.fields().iter().enumerate() { column_names.push(bytes::Bytes::copy_from_slice(field.name().as_bytes())); columns.push(ingest_array(batch.column(index).clone(), field.as_ref())?); } let table = Table::new(Attribute::None, column_names, columns); table .validate() .map_err(|e| IngestionError::Unsupported(e.to_string()))?; Ok(Value::Table(table)) } /// Convert a sequence of record batches (a stream) into a q table `Value`. /// /// All batches must share the same schema. The batches are concatenated using /// `arrow_select::concat::concat_batches` before ingestion. pub fn ingest_record_batch_reader( schema: SchemaRef, batches: impl IntoIterator>, ) -> IngestionResult { let batches: Vec = batches.into_iter().collect::>()?; if batches.is_empty() { // Produce an empty table with the correct schema. let column_names: Vec = schema .fields() .iter() .map(|f| bytes::Bytes::copy_from_slice(f.name().as_bytes())) .collect(); let columns: Vec = schema .fields() .iter() .map(|f| ingest_array(arrow_array::new_empty_array(f.data_type()), f.as_ref())) .collect::>()?; let table = Table::new(Attribute::None, column_names, columns); return Ok(Value::Table(table)); } let merged = arrow_select::concat::concat_batches(&schema, &batches)?; ingest_record_batch(merged) } // --------------------------------------------------------------------------- // Main dispatch // --------------------------------------------------------------------------- fn ingest_with_hint(array: ArrayRef, hint: IngestHint) -> IngestionResult { let shape = hint .shape .unwrap_or_else(|| default_shape(array.data_type())); match shape { IngestShape::UnaryPrimitive => { if array.len() != 1 { return Err(IngestionError::Unsupported(format!( "unary_primitive shape requires length 1, got {}", array.len() ))); } Ok(Value::UnaryPrimitive { opcode: -128 }) } IngestShape::Table => ingest_table(array, hint), IngestShape::Dictionary => ingest_dictionary(array, hint), IngestShape::List => ingest_list(array, hint), IngestShape::Atom | IngestShape::Vector => ingest_scalar_or_vector(array, shape, hint), } } fn default_shape(dt: &DataType) -> IngestShape { match dt { DataType::Null => IngestShape::List, DataType::List(_) | DataType::LargeList(_) => IngestShape::List, // Multiple binary blobs default to a list of char vectors. // Use explicit metadata (qroissant.shape=vector) for char vector. DataType::Binary | DataType::LargeBinary | DataType::BinaryView => IngestShape::List, DataType::Map(_, _) => IngestShape::Dictionary, DataType::Struct(_) => IngestShape::Table, _ => IngestShape::Vector, } } // --------------------------------------------------------------------------- // Table ingestion (Struct array) // --------------------------------------------------------------------------- fn ingest_table(array: ArrayRef, hint: IngestHint) -> IngestionResult { let attribute = hint.attribute.unwrap_or(Attribute::None); let struct_array = array .as_any() .downcast_ref::() .ok_or_else(|| { IngestionError::Unsupported(format!( "q table ingestion requires a StructArray, found {}", array.data_type() )) })?; let fields = match array.data_type() { DataType::Struct(fields) => fields.clone(), other => { return Err(IngestionError::Unsupported(format!( "q table ingestion requires a struct field, found {other}" ))); } }; let mut column_names = Vec::with_capacity(fields.len()); let mut columns = Vec::with_capacity(fields.len()); for (i, child_field) in fields.iter().enumerate() { column_names.push(bytes::Bytes::copy_from_slice(child_field.name().as_bytes())); columns.push(ingest_array( struct_array.column(i).clone(), child_field.as_ref(), )?); } let table = Table::new(attribute, column_names, columns); table .validate() .map_err(|e| IngestionError::Unsupported(e.to_string()))?; Ok(Value::Table(table)) } // --------------------------------------------------------------------------- // Dictionary ingestion (Map array) // --------------------------------------------------------------------------- fn ingest_dictionary(array: ArrayRef, hint: IngestHint) -> IngestionResult { let map_array = array.as_any().downcast_ref::().ok_or_else(|| { IngestionError::Unsupported(format!( "q dictionary ingestion requires a MapArray, found {}", array.data_type() )) })?; if map_array.len() != 1 || map_array.is_null(0) { return Err(IngestionError::Unsupported( "q dictionary ingestion requires a non-null length-1 Arrow map".to_string(), )); } let entries = map_array.value(0); let sorted = hint.sorted.unwrap_or(false); let entry_fields = entries.fields().clone(); let keys = ingest_array(entries.column(0).clone(), entry_fields[0].as_ref())?; let values = ingest_array(entries.column(1).clone(), entry_fields[1].as_ref())?; let dict = Dictionary::new(sorted, keys, values); dict.validate() .map_err(|e| IngestionError::Unsupported(e.to_string()))?; Ok(Value::Dictionary(dict)) } // --------------------------------------------------------------------------- // List ingestion (List / LargeList / Binary / BinaryView arrays) // --------------------------------------------------------------------------- fn ingest_list(array: ArrayRef, hint: IngestHint) -> IngestionResult { let attribute = hint.attribute.unwrap_or(Attribute::None); match array.data_type() { DataType::Null => { let values = (0..array.len()) .map(|_| Value::UnaryPrimitive { opcode: -128 }) .collect(); Ok(Value::List(List::new(attribute, values))) } DataType::List(child_field) => { let child_field = child_field.clone(); let list_array = array .as_any() .downcast_ref::() .expect("List datatype must match ListArray"); let mut values = Vec::with_capacity(list_array.len()); for i in 0..list_array.len() { let child = list_array.value(i); values.push(ingest_array(child, child_field.as_ref())?); } Ok(Value::List(List::new(attribute, values))) } DataType::LargeList(child_field) => { let child_field = child_field.clone(); let list_array = array .as_any() .downcast_ref::() .expect("LargeList datatype must match LargeListArray"); let mut values = Vec::with_capacity(list_array.len()); for i in 0..list_array.len() { let child = list_array.value(i); values.push(ingest_array(child, child_field.as_ref())?); } Ok(Value::List(List::new(attribute, values))) } DataType::Binary => { let binary = array .as_any() .downcast_ref::() .expect("Binary datatype must match BinaryArray"); let values = (0..binary.len()) .map(|i| { Value::Vector(Vector::new( Attribute::None, VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))), )) }) .collect(); Ok(Value::List(List::new(attribute, values))) } DataType::LargeBinary => { let binary = array .as_any() .downcast_ref::() .expect("LargeBinary datatype must match LargeBinaryArray"); let values = (0..binary.len()) .map(|i| { Value::Vector(Vector::new( Attribute::None, VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))), )) }) .collect(); Ok(Value::List(List::new(attribute, values))) } DataType::BinaryView => { let binary = array .as_any() .downcast_ref::() .expect("BinaryView datatype must match BinaryViewArray"); let values = (0..binary.len()) .map(|i| { Value::Vector(Vector::new( Attribute::None, VectorData::Char(bytes::Bytes::copy_from_slice(binary.value(i))), )) }) .collect(); Ok(Value::List(List::new(attribute, values))) } other => Err(IngestionError::Unsupported(format!( "q list ingestion from Arrow data type {other} is not supported" ))), } } // --------------------------------------------------------------------------- // Scalar / vector ingestion // --------------------------------------------------------------------------- fn ingest_scalar_or_vector( array: ArrayRef, shape: IngestShape, hint: IngestHint, ) -> IngestionResult { let attribute = hint.attribute.unwrap_or(Attribute::None); let is_atom = shape == IngestShape::Atom; if is_atom && array.len() != 1 { return Err(IngestionError::Unsupported(format!( "q atom shape requested but Arrow array has length {}", array.len() ))); } match array.data_type() { DataType::Boolean => ingest_boolean(&array, is_atom, attribute), DataType::UInt8 => { let prim = hint.primitive.unwrap_or(IngestPrimitive::Byte); ingest_u8(&array, prim, is_atom, attribute) } DataType::Int16 => ingest_i16(&array, is_atom, attribute), DataType::Int32 => { let prim = hint.primitive.unwrap_or(IngestPrimitive::Int); ingest_i32(&array, prim, is_atom, attribute) } DataType::Int64 => ingest_i64(&array, is_atom, attribute), DataType::Float32 => ingest_f32(&array, is_atom, attribute), DataType::Float64 => { let prim = hint.primitive.unwrap_or(IngestPrimitive::Float); ingest_f64(&array, prim, is_atom, attribute) } DataType::FixedSizeBinary(1) => { let prim = hint.primitive.unwrap_or(IngestPrimitive::Char); ingest_fixed_binary_1(&array, prim, is_atom, attribute) } DataType::FixedSizeBinary(16) => ingest_fixed_binary_16(&array, is_atom, attribute), DataType::Utf8 => ingest_symbols_utf8(&array, is_atom, attribute), DataType::LargeUtf8 => ingest_symbols_large_utf8(&array, is_atom, attribute), DataType::Utf8View => ingest_symbols_utf8_view(&array, is_atom, attribute), DataType::Dictionary(_, _) => ingest_symbols_dictionary(&array, is_atom, attribute), DataType::Binary => ingest_binary_as_char(&array, is_atom, attribute), DataType::LargeBinary => ingest_large_binary_as_char(&array, is_atom, attribute), DataType::BinaryView => ingest_binary_view_as_char(&array, is_atom, attribute), DataType::Date32 => ingest_date32(&array, is_atom, attribute), DataType::Timestamp(TimeUnit::Nanosecond, tz) => { if tz.is_some() { return Err(IngestionError::Unsupported( "Arrow timestamps with timezone cannot be ingested into q".to_string(), )); } ingest_timestamp_ns(&array, is_atom, attribute) } DataType::Timestamp(TimeUnit::Microsecond, tz) => { if tz.is_some() { return Err(IngestionError::Unsupported( "Arrow timestamps with timezone cannot be ingested into q".to_string(), )); } ingest_timestamp_us(&array, is_atom, attribute) } DataType::Timestamp(TimeUnit::Millisecond, tz) => { if tz.is_some() { return Err(IngestionError::Unsupported( "Arrow timestamps with timezone cannot be ingested into q".to_string(), )); } ingest_timestamp_ms(&array, is_atom, attribute) } DataType::Timestamp(TimeUnit::Second, tz) => { if tz.is_some() { return Err(IngestionError::Unsupported( "Arrow timestamps with timezone cannot be ingested into q".to_string(), )); } ingest_timestamp_s(&array, is_atom, attribute) } DataType::Duration(TimeUnit::Nanosecond) => ingest_duration_ns(&array, is_atom, attribute), DataType::Duration(TimeUnit::Microsecond) => ingest_duration_us(&array, is_atom, attribute), DataType::Duration(TimeUnit::Millisecond) => ingest_duration_ms(&array, is_atom, attribute), DataType::Duration(TimeUnit::Second) => ingest_duration_s(&array, is_atom, attribute), DataType::Time32(TimeUnit::Second) => { let prim = hint.primitive.unwrap_or(IngestPrimitive::Second); ingest_time32_second(&array, prim, is_atom, attribute) } DataType::Time32(TimeUnit::Millisecond) => ingest_time32_ms(&array, is_atom, attribute), DataType::Time64(TimeUnit::Microsecond) => ingest_time64_us(&array, is_atom, attribute), DataType::Time64(TimeUnit::Nanosecond) => ingest_time64_ns(&array, is_atom, attribute), other => Err(IngestionError::Unsupported(format!( "q ingestion from Arrow data type {other} is not supported" ))), } } // --------------------------------------------------------------------------- // Boolean // --------------------------------------------------------------------------- fn ingest_boolean(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Boolean datatype must match BooleanArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow boolean arrays with nulls cannot be ingested as q boolean vectors; \ use a general list shape instead" .to_string(), )); } let values: Vec = (0..arr.len()) .map(|i| if arr.value(i) { 1 } else { 0 }) .collect(); if is_atom { Ok(Value::Atom(Atom::Boolean(values[0] != 0))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Boolean(bytes::Bytes::from(values)), ))) } } // --------------------------------------------------------------------------- // UInt8 (Byte or Char) // --------------------------------------------------------------------------- fn ingest_u8( array: &ArrayRef, prim: IngestPrimitive, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("UInt8 datatype must match UInt8Array"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow UInt8 arrays with nulls cannot be ingested as q byte/char".to_string(), )); } let values: Vec = arr.values().to_vec(); match prim { IngestPrimitive::Char => { if is_atom { Ok(Value::Atom(Atom::Char(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Char(bytes::Bytes::from(values)), ))) } } _ => { if is_atom { Ok(Value::Atom(Atom::Byte(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Byte(bytes::Bytes::from(values)), ))) } } } } // --------------------------------------------------------------------------- // Int16 (Short) // --------------------------------------------------------------------------- fn ingest_i16(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Int16 datatype must match Int16Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_SHORT; } } } if is_atom { Ok(Value::Atom(Atom::Short(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Short(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Int32 (Int, Month, Date, Minute, Second, Time) // --------------------------------------------------------------------------- fn ingest_i32( array: &ArrayRef, prim: IngestPrimitive, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Int32 datatype must match Int32Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { let null_sentinel = i32::MIN; // all i32 q types share i32::MIN as null for i in 0..arr.len() { if arr.is_null(i) { values[i] = null_sentinel; } } } if is_atom { let v = values[0]; let atom = match prim { IngestPrimitive::Month => Atom::Month(v), IngestPrimitive::Date => Atom::Date(v), IngestPrimitive::Minute => Atom::Minute(v), IngestPrimitive::Second => Atom::Second(v), IngestPrimitive::Time => Atom::Time(v), _ => Atom::Int(v), }; Ok(Value::Atom(atom)) } else { let bytes = vec_to_bytes(values); let data = match prim { IngestPrimitive::Month => VectorData::Month(bytes), IngestPrimitive::Date => VectorData::Date(bytes), IngestPrimitive::Minute => VectorData::Minute(bytes), IngestPrimitive::Second => VectorData::Second(bytes), IngestPrimitive::Time => VectorData::Time(bytes), _ => VectorData::Int(bytes), }; Ok(Value::Vector(Vector::new(attribute, data))) } } // --------------------------------------------------------------------------- // Int64 (Long) // --------------------------------------------------------------------------- fn ingest_i64(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Int64 datatype must match Int64Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = i64::MIN; } } } if is_atom { Ok(Value::Atom(Atom::Long(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Long(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Float32 (Real) // --------------------------------------------------------------------------- fn ingest_f32(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Float32 datatype must match Float32Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = f32::NAN; } } } if is_atom { Ok(Value::Atom(Atom::Real(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Real(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Float64 (Float, Datetime) // --------------------------------------------------------------------------- fn ingest_f64( array: &ArrayRef, prim: IngestPrimitive, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Float64 datatype must match Float64Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = f64::NAN; } } } if is_atom { let v = values[0]; let atom = match prim { IngestPrimitive::Datetime => Atom::Datetime(v), _ => Atom::Float(v), }; Ok(Value::Atom(atom)) } else { let bytes = vec_to_bytes(values); let data = match prim { IngestPrimitive::Datetime => VectorData::Datetime(bytes), _ => VectorData::Float(bytes), }; Ok(Value::Vector(Vector::new(attribute, data))) } } // --------------------------------------------------------------------------- // FixedSizeBinary(1) – Char or Byte // --------------------------------------------------------------------------- fn ingest_fixed_binary_1( array: &ArrayRef, prim: IngestPrimitive, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("FixedSizeBinary(1) datatype must match FixedSizeBinaryArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow FixedSizeBinary(1) arrays with nulls cannot be ingested as q char/byte" .to_string(), )); } let values: Vec = (0..arr.len()).map(|i| arr.value(i)[0]).collect(); match prim { IngestPrimitive::Byte => { if is_atom { Ok(Value::Atom(Atom::Byte(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Byte(bytes::Bytes::from(values)), ))) } } _ => { if is_atom { Ok(Value::Atom(Atom::Char(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Char(bytes::Bytes::from(values)), ))) } } } } // --------------------------------------------------------------------------- // FixedSizeBinary(16) – Guid // --------------------------------------------------------------------------- fn ingest_fixed_binary_16( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("FixedSizeBinary(16) datatype must match FixedSizeBinaryArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow FixedSizeBinary(16) arrays with nulls cannot be ingested as q guid".to_string(), )); } let values: Vec<[u8; 16]> = (0..arr.len()) .map(|i| { let mut buf = [0u8; 16]; buf.copy_from_slice(arr.value(i)); buf }) .collect(); if is_atom { Ok(Value::Atom(Atom::Guid(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::from_guids(&values), ))) } } // --------------------------------------------------------------------------- // Symbol (various string types) // --------------------------------------------------------------------------- fn strings_to_symbol_value(strings: Vec>, is_atom: bool, attribute: Attribute) -> Value { if is_atom { Value::Atom(Atom::Symbol(bytes::Bytes::from( strings.into_iter().next().unwrap_or_default(), ))) } else { Value::Vector(Vector::new( attribute, VectorData::Symbol(strings.into_iter().map(bytes::Bytes::from).collect()), )) } } fn ingest_symbols_utf8( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Utf8 datatype must match StringArray"); let values: Vec> = (0..arr.len()) .map(|i| { if arr.is_null(i) { vec![] } else { arr.value(i).as_bytes().to_vec() } }) .collect(); Ok(strings_to_symbol_value(values, is_atom, attribute)) } fn ingest_symbols_large_utf8( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("LargeUtf8 datatype must match LargeStringArray"); let values: Vec> = (0..arr.len()) .map(|i| { if arr.is_null(i) { vec![] } else { arr.value(i).as_bytes().to_vec() } }) .collect(); Ok(strings_to_symbol_value(values, is_atom, attribute)) } fn ingest_symbols_utf8_view( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Utf8View datatype must match StringViewArray"); let values: Vec> = (0..arr.len()) .map(|i| { if arr.is_null(i) { vec![] } else { arr.value(i).as_bytes().to_vec() } }) .collect(); Ok(strings_to_symbol_value(values, is_atom, attribute)) } fn ingest_symbols_dictionary( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { macro_rules! try_dict_type { ($key_type:ty) => {{ if let Some(dict) = array .as_any() .downcast_ref::>() { let values_arr = dict.values(); let strings: Vec> = (0..dict.len()) .map(|i| { if dict.is_null(i) { return vec![]; } let key_idx = dict.key(i).expect("non-null key must have value") as usize; if let Some(s) = values_arr.as_any().downcast_ref::() { s.value(key_idx).as_bytes().to_vec() } else if let Some(s) = values_arr.as_any().downcast_ref::() { s.value(key_idx).as_bytes().to_vec() } else if let Some(s) = values_arr.as_any().downcast_ref::() { s.value(key_idx).as_bytes().to_vec() } else { vec![] } }) .collect(); return Ok(strings_to_symbol_value(strings, is_atom, attribute)); } }}; } try_dict_type!(arrow_array::types::Int8Type); try_dict_type!(arrow_array::types::Int16Type); try_dict_type!(arrow_array::types::Int32Type); try_dict_type!(arrow_array::types::Int64Type); try_dict_type!(arrow_array::types::UInt8Type); try_dict_type!(arrow_array::types::UInt16Type); try_dict_type!(arrow_array::types::UInt32Type); try_dict_type!(arrow_array::types::UInt64Type); Err(IngestionError::Unsupported( "Unsupported dictionary key type for symbol ingestion".to_string(), )) } // --------------------------------------------------------------------------- // Binary → Char vector (single-element binary → char vector) // --------------------------------------------------------------------------- fn ingest_binary_as_char( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Binary datatype must match BinaryArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow Binary arrays with nulls cannot be ingested as q char vectors".to_string(), )); } if arr.len() != 1 { return Err(IngestionError::Unsupported( "Multi-element Binary arrays should use List shape for q ingestion".to_string(), )); } let bytes = arr.value(0).to_vec(); if is_atom && bytes.len() == 1 { Ok(Value::Atom(Atom::Char(bytes[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Char(bytes::Bytes::from(bytes)), ))) } } fn ingest_large_binary_as_char( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("LargeBinary datatype must match LargeBinaryArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow LargeBinary arrays with nulls cannot be ingested as q char vectors".to_string(), )); } if arr.len() != 1 { return Err(IngestionError::Unsupported( "Multi-element LargeBinary arrays should use List shape for q ingestion".to_string(), )); } let bytes = arr.value(0).to_vec(); if is_atom && bytes.len() == 1 { Ok(Value::Atom(Atom::Char(bytes[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Char(bytes::Bytes::from(bytes)), ))) } } fn ingest_binary_view_as_char( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("BinaryView datatype must match BinaryViewArray"); if arr.null_count() != 0 { return Err(IngestionError::Unsupported( "Arrow BinaryView arrays with nulls cannot be ingested as q char vectors".to_string(), )); } if arr.len() != 1 { return Err(IngestionError::Unsupported( "Multi-element BinaryView arrays should use List shape for q ingestion".to_string(), )); } let bytes = arr.value(0).to_vec(); if is_atom && bytes.len() == 1 { Ok(Value::Atom(Atom::Char(bytes[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Char(bytes::Bytes::from(bytes)), ))) } } // --------------------------------------------------------------------------- // Date32 → q Date (days since 2000-01-01) // --------------------------------------------------------------------------- fn ingest_date32(array: &ArrayRef, is_atom: bool, attribute: Attribute) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Date32 datatype must match Date32Array"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_DATE; } } } for v in &mut values { if *v != Q_NULL_DATE { *v = v.saturating_sub(DATE_OFFSET_DAYS); } } if is_atom { Ok(Value::Atom(Atom::Date(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Date(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Timestamp → q Timestamp (ns since 2000-01-01) // --------------------------------------------------------------------------- fn ingest_timestamp_ns( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Timestamp(Nanosecond) must match TimestampNanosecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESTAMP; } } } for v in &mut values { if *v != Q_NULL_TIMESTAMP { *v = v.saturating_sub(TIMESTAMP_OFFSET_NS); } } if is_atom { Ok(Value::Atom(Atom::Timestamp(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timestamp(vec_to_bytes(values)), ))) } } fn ingest_timestamp_us( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Timestamp(Microsecond) must match TimestampMicrosecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESTAMP; } } } for v in &mut values { if *v != Q_NULL_TIMESTAMP { *v = v.saturating_mul(1_000).saturating_sub(TIMESTAMP_OFFSET_NS); } } if is_atom { Ok(Value::Atom(Atom::Timestamp(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timestamp(vec_to_bytes(values)), ))) } } fn ingest_timestamp_ms( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Timestamp(Millisecond) must match TimestampMillisecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESTAMP; } } } for v in &mut values { if *v != Q_NULL_TIMESTAMP { *v = v .saturating_mul(1_000_000) .saturating_sub(TIMESTAMP_OFFSET_NS); } } if is_atom { Ok(Value::Atom(Atom::Timestamp(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timestamp(vec_to_bytes(values)), ))) } } fn ingest_timestamp_s( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Timestamp(Second) must match TimestampSecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESTAMP; } } } for v in &mut values { if *v != Q_NULL_TIMESTAMP { *v = v .saturating_mul(1_000_000_000) .saturating_sub(TIMESTAMP_OFFSET_NS); } } if is_atom { Ok(Value::Atom(Atom::Timestamp(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timestamp(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Duration → q Timespan (ns) // --------------------------------------------------------------------------- fn ingest_duration_ns( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Duration(Nanosecond) must match DurationNanosecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESPAN; } } } if is_atom { Ok(Value::Atom(Atom::Timespan(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timespan(vec_to_bytes(values)), ))) } } fn ingest_duration_us( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Duration(Microsecond) must match DurationMicrosecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESPAN; } } } for v in &mut values { if *v != Q_NULL_TIMESPAN { *v = v.saturating_mul(1_000); } } if is_atom { Ok(Value::Atom(Atom::Timespan(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timespan(vec_to_bytes(values)), ))) } } fn ingest_duration_ms( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Duration(Millisecond) must match DurationMillisecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESPAN; } } } for v in &mut values { if *v != Q_NULL_TIMESPAN { *v = v.saturating_mul(1_000_000); } } if is_atom { Ok(Value::Atom(Atom::Timespan(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timespan(vec_to_bytes(values)), ))) } } fn ingest_duration_s( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Duration(Second) must match DurationSecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIMESPAN; } } } for v in &mut values { if *v != Q_NULL_TIMESPAN { *v = v.saturating_mul(1_000_000_000); } } if is_atom { Ok(Value::Atom(Atom::Timespan(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Timespan(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Time32(Second) → q Second or Minute // --------------------------------------------------------------------------- fn ingest_time32_second( array: &ArrayRef, prim: IngestPrimitive, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Time32(Second) must match Time32SecondArray"); let mut values: Vec = arr.values().to_vec(); if prim == IngestPrimitive::Minute { let null = Q_NULL_MINUTE; if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = null; } } } for v in &mut values { if *v != null { *v /= 60; } } if is_atom { Ok(Value::Atom(Atom::Minute(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Minute(vec_to_bytes(values)), ))) } } else { let null = Q_NULL_SECOND; if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = null; } } } if is_atom { Ok(Value::Atom(Atom::Second(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Second(vec_to_bytes(values)), ))) } } } // --------------------------------------------------------------------------- // Time32(Millisecond) → q Time (ms) // --------------------------------------------------------------------------- fn ingest_time32_ms( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Time32(Millisecond) must match Time32MillisecondArray"); let mut values: Vec = arr.values().to_vec(); if arr.null_count() != 0 { for i in 0..arr.len() { if arr.is_null(i) { values[i] = Q_NULL_TIME; } } } if is_atom { Ok(Value::Atom(Atom::Time(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Time(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Time64(Microsecond) → q Time (ms, truncating) // --------------------------------------------------------------------------- fn ingest_time64_us( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Time64(Microsecond) must match Time64MicrosecondArray"); let values: Vec = (0..arr.len()) .map(|i| { if arr.is_null(i) { Q_NULL_TIME } else { (arr.value(i) / 1_000).clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32 } }) .collect(); if is_atom { Ok(Value::Atom(Atom::Time(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Time(vec_to_bytes(values)), ))) } } // --------------------------------------------------------------------------- // Time64(Nanosecond) → q Time (ms, truncating) // --------------------------------------------------------------------------- fn ingest_time64_ns( array: &ArrayRef, is_atom: bool, attribute: Attribute, ) -> IngestionResult { let arr = array .as_any() .downcast_ref::() .expect("Time64(Nanosecond) must match Time64NanosecondArray"); let values: Vec = (0..arr.len()) .map(|i| { if arr.is_null(i) { Q_NULL_TIME } else { (arr.value(i) / 1_000_000).clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32 } }) .collect(); if is_atom { Ok(Value::Atom(Atom::Time(values[0]))) } else { Ok(Value::Vector(Vector::new( attribute, VectorData::Time(vec_to_bytes(values)), ))) } }