rsbag_arrow: converting rosbag to parqeut
This commit is contained in:
		
							parent
							
								
									92868a954b
								
							
						
					
					
						commit
						2b20469859
					
				@ -2,6 +2,8 @@
 | 
			
		||||
 | 
			
		||||
members = [
 | 
			
		||||
    "rsbag",
 | 
			
		||||
    "rsbagpy",
 | 
			
		||||
    "rsbag_arrow"
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[profile.release]
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,7 @@
 | 
			
		||||
[package]
 | 
			
		||||
name = "rsbag"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
edition = "2018"
 | 
			
		||||
edition = "2021"
 | 
			
		||||
 | 
			
		||||
[features]
 | 
			
		||||
default = ["mmap", "rayon", "bson"]
 | 
			
		||||
@ -16,7 +16,7 @@ lz4_flex = { version = "0.8.2", default-features = false, features = ["std", "ch
 | 
			
		||||
memmap = { version = "0.7.0", optional = true }
 | 
			
		||||
nom = "7.0.0"
 | 
			
		||||
num_enum = "0.5.4"
 | 
			
		||||
rayon = { version = "1.5.1", optional = true }
 | 
			
		||||
rayon = { version = "1.6.1", optional = true }
 | 
			
		||||
regex = "1.5.4"
 | 
			
		||||
ros_message = "0.1.0"
 | 
			
		||||
smallvec = "1.6.1"
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,7 @@
 | 
			
		||||
use std::{fs::File, path::Path};
 | 
			
		||||
 | 
			
		||||
use eyre::Context;
 | 
			
		||||
use rayon::iter::ParallelIterator;
 | 
			
		||||
use rayon::prelude::{IndexedParallelIterator, ParallelIterator};
 | 
			
		||||
 | 
			
		||||
use crate::{
 | 
			
		||||
    chunk::{read_chunks_data, read_chunks_messages, MessageData},
 | 
			
		||||
@ -31,6 +31,10 @@ impl Bag {
 | 
			
		||||
        &self.index
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn reader(&self) -> &MmapReader {
 | 
			
		||||
        &self.reader
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn compute_message_layouts(&mut self) -> Result<MessageLayouts> {
 | 
			
		||||
        MessageLayouts::new(&self.index.connections)
 | 
			
		||||
    }
 | 
			
		||||
@ -39,7 +43,7 @@ impl Bag {
 | 
			
		||||
        BagInfo::compute(self.reader.clone(), &self.index)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn read_chunks(&mut self) -> impl ParallelIterator<Item = Result<Vec<u8>>> + '_ {
 | 
			
		||||
    pub fn read_chunks(&mut self) -> impl IndexedParallelIterator<Item = Result<Vec<u8>>> + '_ {
 | 
			
		||||
        read_chunks_data(self.reader.clone(), &self.index.chunks)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -6,7 +6,10 @@ use std::{
 | 
			
		||||
 | 
			
		||||
use bytes::Bytes;
 | 
			
		||||
use eyre::{bail, eyre, Context};
 | 
			
		||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
 | 
			
		||||
use rayon::{
 | 
			
		||||
    iter::{IntoParallelIterator, ParallelIterator},
 | 
			
		||||
    prelude::IndexedParallelIterator,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
use crate::{
 | 
			
		||||
    error,
 | 
			
		||||
@ -94,10 +97,11 @@ pub fn read_chunk_data_at<R: BagReader + io::Seek>(
 | 
			
		||||
pub fn read_chunks_data<'a, R, C>(
 | 
			
		||||
    reader: R,
 | 
			
		||||
    chunks: C,
 | 
			
		||||
) -> impl ParallelIterator<Item = Result<Vec<u8>>> + 'a
 | 
			
		||||
) -> impl IndexedParallelIterator<Item = Result<Vec<u8>>> + 'a
 | 
			
		||||
where
 | 
			
		||||
    R: BagReader + io::Seek + Clone + Send + 'a,
 | 
			
		||||
    C: IntoParallelIterator<Item = &'a ChunkInfo> + 'a,
 | 
			
		||||
    <C as IntoParallelIterator>::Iter: IndexedParallelIterator,
 | 
			
		||||
{
 | 
			
		||||
    chunks
 | 
			
		||||
        .into_par_iter()
 | 
			
		||||
@ -139,6 +143,7 @@ pub fn read_chunks_messages<'a, R, C>(
 | 
			
		||||
where
 | 
			
		||||
    R: BagReader + io::Seek + Clone + Send + 'a,
 | 
			
		||||
    C: IntoParallelIterator<Item = &'a ChunkInfo> + 'a,
 | 
			
		||||
    <C as IntoParallelIterator>::Iter: IndexedParallelIterator,
 | 
			
		||||
{
 | 
			
		||||
    read_chunks_data(reader, chunks).flat_map_iter(move |data| ChunkMessageIterator {
 | 
			
		||||
        reader: data.map(|data| BytesReader::from(Bytes::from(data))),
 | 
			
		||||
 | 
			
		||||
@ -38,7 +38,7 @@ impl ConnInfo {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug)]
 | 
			
		||||
#[derive(Clone, Debug)]
 | 
			
		||||
pub struct ChunkInfo {
 | 
			
		||||
    pub pos: u64,
 | 
			
		||||
    pub start_time: Time,
 | 
			
		||||
@ -70,7 +70,7 @@ impl ChunkInfo {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug)]
 | 
			
		||||
#[derive(Clone, Debug)]
 | 
			
		||||
pub struct ChunkConnection {
 | 
			
		||||
    pub conn_id: u32,
 | 
			
		||||
    pub count: u32,
 | 
			
		||||
@ -88,7 +88,7 @@ impl ChunkConnection {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug)]
 | 
			
		||||
#[derive(Clone, Debug)]
 | 
			
		||||
pub struct BagIndex {
 | 
			
		||||
    pub connections: Vec<ConnInfo>,
 | 
			
		||||
    pub chunks: Vec<ChunkInfo>,
 | 
			
		||||
 | 
			
		||||
@ -114,7 +114,7 @@ impl FieldLayout {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Clone, Debug, PartialEq, Eq)]
 | 
			
		||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 | 
			
		||||
pub enum Multiplicity {
 | 
			
		||||
    Unit,
 | 
			
		||||
    Fixed(usize),
 | 
			
		||||
 | 
			
		||||
@ -52,9 +52,9 @@ pub fn compute_layout(conn: &ConnInfo) -> Result<MessageLayout> {
 | 
			
		||||
    MessageLayout::from_msgs(&msgs)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct ConnData {
 | 
			
		||||
    info: Arc<ConnInfo>,
 | 
			
		||||
    layout: MessageLayout,
 | 
			
		||||
pub struct ConnData {
 | 
			
		||||
    pub info: Arc<ConnInfo>,
 | 
			
		||||
    pub layout: MessageLayout,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl ConnData {
 | 
			
		||||
@ -67,7 +67,7 @@ impl ConnData {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
pub struct MessageLayouts {
 | 
			
		||||
    conns: HashMap<u32, ConnData>,
 | 
			
		||||
    pub conns: HashMap<u32, ConnData>,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl MessageLayouts {
 | 
			
		||||
@ -79,6 +79,10 @@ impl MessageLayouts {
 | 
			
		||||
        Ok(MessageLayouts { conns })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn get(&self, conn_id: u32) -> Option<&MessageLayout> {
 | 
			
		||||
        self.conns.get(&conn_id).map(|conn_data| &conn_data.layout)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn decode(&self, message: &MessageData) -> Result<(Arc<ConnInfo>, MessageValue)> {
 | 
			
		||||
        let conn = self
 | 
			
		||||
            .conns
 | 
			
		||||
 | 
			
		||||
@ -7,6 +7,7 @@ use crate::{parse, Result};
 | 
			
		||||
 | 
			
		||||
const READ_SIZE: usize = 4096;
 | 
			
		||||
 | 
			
		||||
#[derive(Clone)]
 | 
			
		||||
pub struct IoReader<R> {
 | 
			
		||||
    read: R,
 | 
			
		||||
    buffer: BytesMut,
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										20
									
								
								rsbag_arrow/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								rsbag_arrow/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,20 @@
 | 
			
		||||
[package]
 | 
			
		||||
name = "rsbag_arrow"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
edition = "2018"
 | 
			
		||||
rust-version = "1.66"
 | 
			
		||||
 | 
			
		||||
[dependencies]
 | 
			
		||||
arrow2 = { version = "0.15.0", features = ["io_parquet", "io_parquet_compression"] }
 | 
			
		||||
color-eyre = "0.5.11"
 | 
			
		||||
env_logger = "0.9.0"
 | 
			
		||||
eyre = "0.6.5"
 | 
			
		||||
log = "0.4.14"
 | 
			
		||||
indicatif = { version = "0.16.2", features = ["rayon"] }
 | 
			
		||||
rayon = { version = "1.6.1" }
 | 
			
		||||
rsbag = { path = "../rsbag" }
 | 
			
		||||
nom = "7.0.0"
 | 
			
		||||
mkdirp = "1.0.0"
 | 
			
		||||
 | 
			
		||||
[target.'cfg(not(target_env = "msvc"))'.dependencies]
 | 
			
		||||
jemallocator = "0.3.2"
 | 
			
		||||
							
								
								
									
										355
									
								
								rsbag_arrow/src/arrow.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										355
									
								
								rsbag_arrow/src/arrow.rs
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,355 @@
 | 
			
		||||
use arrow2::{
 | 
			
		||||
    array::{
 | 
			
		||||
        MutableArray, MutableBinaryArray, MutableBooleanArray, MutableFixedSizeBinaryArray,
 | 
			
		||||
        MutableListArray, MutablePrimitiveArray, MutableStructArray, MutableUtf8ValuesArray,
 | 
			
		||||
        TryPush,
 | 
			
		||||
    },
 | 
			
		||||
    datatypes::{DataType, Field, TimeUnit},
 | 
			
		||||
    types::NativeType,
 | 
			
		||||
};
 | 
			
		||||
use rsbag::{
 | 
			
		||||
    layout::{FieldLayout, FieldType, MessageLayout, Multiplicity},
 | 
			
		||||
    parse::{header::fields::parse_time, Error, ErrorKind, IResult, Input},
 | 
			
		||||
    Result,
 | 
			
		||||
};
 | 
			
		||||
use eyre::bail;
 | 
			
		||||
use nom::{
 | 
			
		||||
    bytes::complete::take,
 | 
			
		||||
    combinator::map_res,
 | 
			
		||||
    multi::{length_count, length_data, many_m_n},
 | 
			
		||||
    number::complete::{
 | 
			
		||||
        le_f32, le_f64, le_i16, le_i32, le_i64, le_i8, le_u16, le_u32, le_u64, le_u8,
 | 
			
		||||
    },
 | 
			
		||||
    Parser,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
fn make_mutable_array<A: MutableArray + Default + 'static>(
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
) -> Box<dyn MutableArray> {
 | 
			
		||||
    make_mutable_array_from_values(multiplicity, A::default())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn make_mutable_array_from_values<A: MutableArray + 'static>(
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
    values: A,
 | 
			
		||||
) -> Box<dyn MutableArray> {
 | 
			
		||||
    match multiplicity {
 | 
			
		||||
        Multiplicity::Unit => Box::new(values),
 | 
			
		||||
        // TODO: cannot write fixed size list to parqeut
 | 
			
		||||
        // Multiplicity::Fixed(size) => Box::new(MutableFixedSizeListArray::new_with_field(
 | 
			
		||||
        //     values, "item", false, size,
 | 
			
		||||
        // )),
 | 
			
		||||
        Multiplicity::Fixed(_) | Multiplicity::Dynamic => Box::new(
 | 
			
		||||
            MutableListArray::<i32, A>::new_with_field(values, "item", false),
 | 
			
		||||
        ),
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn field_to_mutable_array(field: &FieldLayout) -> Box<dyn MutableArray> {
 | 
			
		||||
    // TODO: BinaryArray for I8/U8
 | 
			
		||||
    match &field.typ {
 | 
			
		||||
        FieldType::Bool => make_mutable_array::<MutableBooleanArray>(field.multiplicity),
 | 
			
		||||
        FieldType::I8(_) => make_mutable_array::<MutablePrimitiveArray<i8>>(field.multiplicity),
 | 
			
		||||
        FieldType::I16 => make_mutable_array::<MutablePrimitiveArray<i16>>(field.multiplicity),
 | 
			
		||||
        FieldType::I32 => make_mutable_array::<MutablePrimitiveArray<i32>>(field.multiplicity),
 | 
			
		||||
        FieldType::I64 => make_mutable_array::<MutablePrimitiveArray<i64>>(field.multiplicity),
 | 
			
		||||
        FieldType::U8(_) => match field.multiplicity {
 | 
			
		||||
            Multiplicity::Unit => Box::new(MutablePrimitiveArray::<u8>::new()),
 | 
			
		||||
            Multiplicity::Fixed(n) => Box::new(MutableFixedSizeBinaryArray::new(n)),
 | 
			
		||||
            Multiplicity::Dynamic => Box::new(MutableBinaryArray::<i32>::new()),
 | 
			
		||||
        },
 | 
			
		||||
        FieldType::U16 => make_mutable_array::<MutablePrimitiveArray<u16>>(field.multiplicity),
 | 
			
		||||
        FieldType::U32 => make_mutable_array::<MutablePrimitiveArray<u32>>(field.multiplicity),
 | 
			
		||||
        FieldType::U64 => make_mutable_array::<MutablePrimitiveArray<u64>>(field.multiplicity),
 | 
			
		||||
        FieldType::F32 => make_mutable_array::<MutablePrimitiveArray<f32>>(field.multiplicity),
 | 
			
		||||
        FieldType::F64 => make_mutable_array::<MutablePrimitiveArray<f64>>(field.multiplicity),
 | 
			
		||||
        FieldType::String => make_mutable_array::<MutableUtf8ValuesArray<i32>>(field.multiplicity),
 | 
			
		||||
        FieldType::Time => make_mutable_array_from_values(
 | 
			
		||||
            field.multiplicity,
 | 
			
		||||
            MutablePrimitiveArray::<i64>::from(DataType::Timestamp(
 | 
			
		||||
                TimeUnit::Nanosecond,
 | 
			
		||||
                Some("UTC".into()),
 | 
			
		||||
            )),
 | 
			
		||||
        ),
 | 
			
		||||
        FieldType::Duration => make_mutable_array_from_values(
 | 
			
		||||
            field.multiplicity,
 | 
			
		||||
            MutablePrimitiveArray::<i64>::from(DataType::Duration(TimeUnit::Nanosecond)),
 | 
			
		||||
        ),
 | 
			
		||||
        FieldType::Message(layout) => {
 | 
			
		||||
            make_mutable_array_from_values(field.multiplicity, layout_to_mutable_array(&*layout))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn layout_to_mutable_arrays(layout: &MessageLayout) -> Vec<Box<dyn MutableArray>> {
 | 
			
		||||
    layout.fields().iter().map(field_to_mutable_array).collect()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
pub fn layout_to_mutable_array(layout: &MessageLayout) -> MutableStructArray {
 | 
			
		||||
    let values = layout_to_mutable_arrays(layout);
 | 
			
		||||
    let fields = layout
 | 
			
		||||
        .fields()
 | 
			
		||||
        .iter()
 | 
			
		||||
        .zip(&values)
 | 
			
		||||
        .map(|(field, array)| Field::new(field.name.to_string(), array.data_type().clone(), false))
 | 
			
		||||
        .collect();
 | 
			
		||||
    let datatype = DataType::Struct(fields);
 | 
			
		||||
    MutableStructArray::new(datatype, values)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// TODO: more efficient primitive parsing using bytemuck
 | 
			
		||||
fn parse_push_primitive<'a, 'b: 'a, T, P>(
 | 
			
		||||
    mut parser: P,
 | 
			
		||||
    array: &'a mut dyn MutableArray,
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
    input: Input<'b>,
 | 
			
		||||
) -> IResult<'b, ()>
 | 
			
		||||
where
 | 
			
		||||
    T: NativeType,
 | 
			
		||||
    P: Parser<Input<'b>, T, Error<Input<'b>>> + 'a,
 | 
			
		||||
{
 | 
			
		||||
    match multiplicity {
 | 
			
		||||
        Multiplicity::Unit => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutablePrimitiveArray<T>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, value) = parser.parse(input)?;
 | 
			
		||||
            array.push(Some(value));
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Fixed(n) => {
 | 
			
		||||
            // TODO: FixedSizeListArray unsupported to write to parquet
 | 
			
		||||
            // let array = array
 | 
			
		||||
            //     .as_mut_any()
 | 
			
		||||
            //     .downcast_mut::<MutableFixedSizeListArray<MutablePrimitiveArray<T>>>()
 | 
			
		||||
            //     .expect("wrong array type");
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, MutablePrimitiveArray<T>>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = many_m_n(n, n, parser).parse(input)?;
 | 
			
		||||
            array.mut_values().extend_from_slice(values.as_ref());
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Dynamic => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, MutablePrimitiveArray<T>>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = length_count(le_u32, parser).parse(input)?;
 | 
			
		||||
            array.mut_values().extend_from_slice(values.as_ref());
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// TODO: more efficient primitive parsing using bytemuck
 | 
			
		||||
fn parse_push_u8<'a, 'b: 'a>(
 | 
			
		||||
    array: &'a mut dyn MutableArray,
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
    input: Input<'b>,
 | 
			
		||||
) -> IResult<'b, ()> {
 | 
			
		||||
    // TODO: copy directly without
 | 
			
		||||
    match multiplicity {
 | 
			
		||||
        Multiplicity::Unit => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutablePrimitiveArray<u8>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, value) = le_u8.parse(input)?;
 | 
			
		||||
            array.push(Some(value));
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Fixed(n) => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableFixedSizeBinaryArray>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = take(n).parse(input)?;
 | 
			
		||||
            array.push(Some(values));
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Dynamic => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableBinaryArray<i32>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = length_data(le_u32).parse(input)?;
 | 
			
		||||
            array.push(Some(values));
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn parse_push<'a, 'b: 'a, A, T, P>(
 | 
			
		||||
    mut parser: P,
 | 
			
		||||
    array: &'a mut dyn MutableArray,
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
    input: Input<'b>,
 | 
			
		||||
) -> IResult<'b, ()>
 | 
			
		||||
where
 | 
			
		||||
    P: Parser<Input<'b>, T, Error<Input<'b>>> + 'a,
 | 
			
		||||
    A: MutableArray + TryPush<T> + 'static,
 | 
			
		||||
{
 | 
			
		||||
    match multiplicity {
 | 
			
		||||
        Multiplicity::Unit => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<A>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, value) = parser.parse(input)?;
 | 
			
		||||
            array.try_push(value).expect("array push failed");
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Fixed(n) => {
 | 
			
		||||
            // TODO: FixedSizeListArray unsupported to write to parquet
 | 
			
		||||
            // let array = array
 | 
			
		||||
            //     .as_mut_any()
 | 
			
		||||
            //     .downcast_mut::<MutableFixedSizeListArray<A>>()
 | 
			
		||||
            //     .expect("wrong array type");
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, A>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = many_m_n(n, n, parser).parse(input)?;
 | 
			
		||||
            for value in values {
 | 
			
		||||
                array
 | 
			
		||||
                    .mut_values()
 | 
			
		||||
                    .try_push(value)
 | 
			
		||||
                    .expect("array push failed");
 | 
			
		||||
            }
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Dynamic => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, A>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let (rest, values) = length_count(le_u32, parser).parse(input)?;
 | 
			
		||||
            for value in values {
 | 
			
		||||
                array
 | 
			
		||||
                    .mut_values()
 | 
			
		||||
                    .try_push(value)
 | 
			
		||||
                    .expect("array push failed");
 | 
			
		||||
            }
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn parse_push_message<'a, 'b: 'a>(
 | 
			
		||||
    layout: &MessageLayout,
 | 
			
		||||
    array: &'a mut dyn MutableArray,
 | 
			
		||||
    multiplicity: Multiplicity,
 | 
			
		||||
    input: Input<'b>,
 | 
			
		||||
) -> IResult<'b, ()> {
 | 
			
		||||
    match multiplicity {
 | 
			
		||||
        Multiplicity::Unit => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableStructArray>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            message_parser(layout, array).parse(input)
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Fixed(n) => {
 | 
			
		||||
            // TODO: FixedSizeListArray unsupported to write to parquet
 | 
			
		||||
            // let array = array
 | 
			
		||||
            //     .as_mut_any()
 | 
			
		||||
            //     .downcast_mut::<MutableFixedSizeListArray<MutableStructArray>>()
 | 
			
		||||
            //     .expect("wrong array type");
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, MutableStructArray>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let parser = message_parser(layout, array.mut_values());
 | 
			
		||||
            let (rest, _) = many_m_n(n, n, parser).map(|_| ()).parse(input)?;
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
        Multiplicity::Dynamic => {
 | 
			
		||||
            let array = array
 | 
			
		||||
                .as_mut_any()
 | 
			
		||||
                .downcast_mut::<MutableListArray<i32, MutableStructArray>>()
 | 
			
		||||
                .expect("wrong array type");
 | 
			
		||||
            let parser = message_parser(layout, array.mut_values());
 | 
			
		||||
            let (rest, _) = length_count(le_u32, parser).map(|_| ()).parse(input)?;
 | 
			
		||||
            array.try_push_valid().unwrap();
 | 
			
		||||
            Ok((rest, ()))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn field_parser<'a, 'b: 'a>(
 | 
			
		||||
    field: &'a FieldLayout,
 | 
			
		||||
    array: &'a mut dyn MutableArray,
 | 
			
		||||
) -> impl Parser<Input<'b>, (), Error<Input<'b>>> + 'a {
 | 
			
		||||
    move |input| -> IResult<()> {
 | 
			
		||||
        match &field.typ {
 | 
			
		||||
            FieldType::Bool => parse_push::<MutableBooleanArray, _, _>(
 | 
			
		||||
                le_u8.map(|v| Some(v != 0)),
 | 
			
		||||
                array,
 | 
			
		||||
                field.multiplicity,
 | 
			
		||||
                input,
 | 
			
		||||
            ),
 | 
			
		||||
            FieldType::I8(_) => parse_push_primitive(le_i8, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::I16 => parse_push_primitive(le_i16, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::I32 => parse_push_primitive(le_i32, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::I64 => parse_push_primitive(le_i64, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::U8(_) => parse_push_u8(array, field.multiplicity, input),
 | 
			
		||||
            FieldType::U16 => parse_push_primitive(le_u16, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::U32 => parse_push_primitive(le_u32, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::U64 => parse_push_primitive(le_u64, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::F32 => parse_push_primitive(le_f32, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::F64 => parse_push_primitive(le_f64, array, field.multiplicity, input),
 | 
			
		||||
            FieldType::String => parse_push::<MutableUtf8ValuesArray<i32>, _, _>(
 | 
			
		||||
                map_res(length_data(le_u32), |s| {
 | 
			
		||||
                    String::from_utf8(Vec::from(s)).map_err(ErrorKind::from)
 | 
			
		||||
                }),
 | 
			
		||||
                array,
 | 
			
		||||
                field.multiplicity,
 | 
			
		||||
                input,
 | 
			
		||||
            ),
 | 
			
		||||
            FieldType::Time => parse_push_primitive(
 | 
			
		||||
                parse_time.map(|time| time.nanos()),
 | 
			
		||||
                array,
 | 
			
		||||
                field.multiplicity,
 | 
			
		||||
                input,
 | 
			
		||||
            ),
 | 
			
		||||
            FieldType::Duration => todo!(),
 | 
			
		||||
            FieldType::Message(layout) => {
 | 
			
		||||
                parse_push_message(layout.as_ref(), array, field.multiplicity, input)
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn message_parser<'a, 'b: 'a>(
 | 
			
		||||
    layout: &'a MessageLayout,
 | 
			
		||||
    array: &'a mut MutableStructArray,
 | 
			
		||||
) -> impl Parser<Input<'b>, (), Error<Input<'b>>> + 'a {
 | 
			
		||||
    move |mut input| -> IResult<_> {
 | 
			
		||||
        for (field, values) in layout.fields().iter().zip(array.mut_values()) {
 | 
			
		||||
            let (rest, _) = field_parser(field, &mut **values).parse(input)?;
 | 
			
		||||
            input = rest;
 | 
			
		||||
        }
 | 
			
		||||
        Ok((input, ()))
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
pub fn parse_message(
 | 
			
		||||
    layout: &MessageLayout,
 | 
			
		||||
    input: &[u8],
 | 
			
		||||
    array: &mut MutableStructArray,
 | 
			
		||||
) -> Result<()> {
 | 
			
		||||
    match message_parser(layout, array).parse(input) {
 | 
			
		||||
        Ok((&[], message)) => Ok(message),
 | 
			
		||||
        Ok(_) => bail!("extra data after message"),
 | 
			
		||||
        Err(nom::Err::Incomplete(_)) => unreachable!(),
 | 
			
		||||
        Err(nom::Err::Error(e) | nom::Err::Failure(e)) => Err(e.into_owned().into()),
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										421
									
								
								rsbag_arrow/src/main.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										421
									
								
								rsbag_arrow/src/main.rs
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,421 @@
 | 
			
		||||
use std::{
 | 
			
		||||
    collections::{BTreeMap, BTreeSet, HashMap, VecDeque},
 | 
			
		||||
    env::args,
 | 
			
		||||
    fs::File,
 | 
			
		||||
    path::Path,
 | 
			
		||||
    sync::{Arc, Mutex},
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
use arrow2::{
 | 
			
		||||
    array::{Array, MutableArray, MutableStructArray},
 | 
			
		||||
    chunk::Chunk,
 | 
			
		||||
    datatypes::{DataType, Field, PhysicalType, Schema},
 | 
			
		||||
    io::parquet::write::{self as pq_write, row_group_iter, CompressedPage, ZstdLevel},
 | 
			
		||||
    types::PrimitiveType,
 | 
			
		||||
};
 | 
			
		||||
use indicatif::ParallelProgressIterator;
 | 
			
		||||
use log::{info, trace};
 | 
			
		||||
use rayon::{
 | 
			
		||||
    iter::ParallelIterator,
 | 
			
		||||
    prelude::{IndexedParallelIterator, ParallelBridge},
 | 
			
		||||
};
 | 
			
		||||
use rsbag::{
 | 
			
		||||
    chunk::{read_chunk_data_at, ChunkMessageIterator, MessageData},
 | 
			
		||||
    index::BagIndex,
 | 
			
		||||
    layout::MessageLayout,
 | 
			
		||||
    message::MessageLayouts,
 | 
			
		||||
    Bag, Result,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#[cfg(not(target_env = "msvc"))]
 | 
			
		||||
use jemallocator::Jemalloc;
 | 
			
		||||
 | 
			
		||||
#[cfg(not(target_env = "msvc"))]
 | 
			
		||||
#[global_allocator]
 | 
			
		||||
static GLOBAL: Jemalloc = Jemalloc;
 | 
			
		||||
 | 
			
		||||
mod arrow;
 | 
			
		||||
 | 
			
		||||
struct Bla {
 | 
			
		||||
    columns: VecDeque<CompressedPage>,
 | 
			
		||||
    current: Option<CompressedPage>,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl Bla {
 | 
			
		||||
    pub fn new(columns: VecDeque<CompressedPage>) -> Self {
 | 
			
		||||
        Self {
 | 
			
		||||
            columns,
 | 
			
		||||
            current: None,
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl pq_write::FallibleStreamingIterator for Bla {
 | 
			
		||||
    type Item = CompressedPage;
 | 
			
		||||
    type Error = arrow2::error::Error;
 | 
			
		||||
 | 
			
		||||
    fn advance(&mut self) -> Result<(), arrow2::error::Error> {
 | 
			
		||||
        self.current = self.columns.pop_front();
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn get(&self) -> Option<&Self::Item> {
 | 
			
		||||
        self.current.as_ref()
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type ChunkSet = BTreeSet<usize>;
 | 
			
		||||
 | 
			
		||||
struct ConnToArrow {
 | 
			
		||||
    layout: MessageLayout,
 | 
			
		||||
    array: MutableStructArray,
 | 
			
		||||
    expected_chunks: ChunkSet,
 | 
			
		||||
    /// Messages waiting to be inserted in array in order of chunk
 | 
			
		||||
    pending_messages: BTreeMap<usize, Vec<MessageData>>,
 | 
			
		||||
    estimated_size: usize,
 | 
			
		||||
    ready_chunks: VecDeque<Chunk<Box<dyn Array>>>,
 | 
			
		||||
    max_chunk_size: usize,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl ConnToArrow {
 | 
			
		||||
    fn try_new(layout: MessageLayout, expected_chunks: ChunkSet) -> Result<Self> {
 | 
			
		||||
        let array = arrow::layout_to_mutable_array(&layout);
 | 
			
		||||
 | 
			
		||||
        Ok(Self {
 | 
			
		||||
            layout: layout,
 | 
			
		||||
            array: array,
 | 
			
		||||
            expected_chunks,
 | 
			
		||||
            pending_messages: Default::default(),
 | 
			
		||||
            estimated_size: 0,
 | 
			
		||||
            ready_chunks: Default::default(),
 | 
			
		||||
            max_chunk_size: 128 * 1024 * 1024,
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn add_message(&mut self, chunk_idx: usize, message: MessageData) {
 | 
			
		||||
        self.pending_messages
 | 
			
		||||
            .entry(chunk_idx)
 | 
			
		||||
            .or_default()
 | 
			
		||||
            .push(message);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn chunk_complete(&mut self, chunk_idx: usize) -> Result<()> {
 | 
			
		||||
        assert!(self.expected_chunks.remove(&chunk_idx));
 | 
			
		||||
        // Flush all pending messages from chunks before the last expected chunk.
 | 
			
		||||
        // i.e. wait to flush a chunk until all previous chunks have been recieved.
 | 
			
		||||
        let latest_expected_chunk = self.expected_chunks.first().copied();
 | 
			
		||||
        let mut flushed = 0;
 | 
			
		||||
        for (&chunk_idx, messages) in self.pending_messages.iter_mut() {
 | 
			
		||||
            if let Some(latest_expected_chunk) = latest_expected_chunk {
 | 
			
		||||
                if chunk_idx >= latest_expected_chunk {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            flushed += 1;
 | 
			
		||||
            for message in messages.drain(..) {
 | 
			
		||||
                arrow::parse_message(&self.layout, message.data.as_ref(), &mut self.array)?;
 | 
			
		||||
                self.estimated_size += message.data.len();
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        trace!(
 | 
			
		||||
            "Chunk {} complete, with {} pending chunks (latest {:?}). Flushed {} chunks",
 | 
			
		||||
            chunk_idx,
 | 
			
		||||
            self.pending_messages.len(),
 | 
			
		||||
            latest_expected_chunk,
 | 
			
		||||
            flushed
 | 
			
		||||
        );
 | 
			
		||||
        // Remove all empty chunks
 | 
			
		||||
        self.pending_messages.retain(|_, v| !v.is_empty());
 | 
			
		||||
 | 
			
		||||
        if self.estimated_size > self.max_chunk_size {
 | 
			
		||||
            trace!(
 | 
			
		||||
                "connection data size exceeds {}MiB, flushing",
 | 
			
		||||
                self.max_chunk_size / (1024 * 1024)
 | 
			
		||||
            );
 | 
			
		||||
            self.flush();
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn flush(&mut self) {
 | 
			
		||||
        let chunk = Chunk::new(
 | 
			
		||||
            std::mem::take(self.array.mut_values())
 | 
			
		||||
                .into_iter()
 | 
			
		||||
                .map(|mut v| v.as_box())
 | 
			
		||||
                .collect(),
 | 
			
		||||
        );
 | 
			
		||||
        self.ready_chunks.push_back(chunk);
 | 
			
		||||
 | 
			
		||||
        self.array = arrow::layout_to_mutable_array(&self.layout);
 | 
			
		||||
        self.estimated_size = 0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn take_chunk(&mut self) -> Option<Chunk<Box<dyn Array>>> {
 | 
			
		||||
        self.ready_chunks.pop_front()
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Thin wrapper around an arrow2 parquet `FileWriter` plus the per-column
/// encodings chosen once at construction.
struct ParquetWriter {
    // One encoding list per schema field (nested fields flattened by
    // `pq_write::transverse`).
    encodings: Vec<Vec<pq_write::Encoding>>,
    file_writer: pq_write::FileWriter<File>,
}
 | 
			
		||||
 | 
			
		||||
impl ParquetWriter {
 | 
			
		||||
    fn try_new(fields: &[Field], file: File) -> Result<Self> {
 | 
			
		||||
        let writer_options = pq_write::WriteOptions {
 | 
			
		||||
            write_statistics: true,
 | 
			
		||||
            version: pq_write::Version::V2,
 | 
			
		||||
            compression: pq_write::CompressionOptions::Zstd(Some(ZstdLevel::try_new(10).unwrap())),
 | 
			
		||||
            // compression: pq_write::CompressionOptions::Zstd(None),
 | 
			
		||||
            // compression: pq_write::CompressionOptions::Snappy,
 | 
			
		||||
            // compression: pq_write::CompressionOptions::Uncompressed,
 | 
			
		||||
            data_pagesize_limit: Some(128 * 1024 * 1024),
 | 
			
		||||
            // data_pagesize_limit: Some(16 * 1024 * 1024),
 | 
			
		||||
        };
 | 
			
		||||
        let schema = Schema {
 | 
			
		||||
            fields: fields.to_vec(),
 | 
			
		||||
            metadata: Default::default(),
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        let encodings: Vec<_> = fields
 | 
			
		||||
            .iter()
 | 
			
		||||
            .map(|f| {
 | 
			
		||||
                pq_write::transverse(&f.data_type, |dtype| match dtype.to_physical_type() {
 | 
			
		||||
                    PhysicalType::Primitive(pt) => match pt {
 | 
			
		||||
                        PrimitiveType::Float16
 | 
			
		||||
                        | PrimitiveType::Float32
 | 
			
		||||
                        | PrimitiveType::Float64 => pq_write::Encoding::Plain,
 | 
			
		||||
                        _ => pq_write::Encoding::DeltaBinaryPacked,
 | 
			
		||||
                    },
 | 
			
		||||
                    PhysicalType::Binary => pq_write::Encoding::DeltaLengthByteArray,
 | 
			
		||||
                    _ => pq_write::Encoding::Plain,
 | 
			
		||||
                })
 | 
			
		||||
            })
 | 
			
		||||
            .collect();
 | 
			
		||||
 | 
			
		||||
        let file_writer = pq_write::FileWriter::try_new(file, schema, writer_options)?;
 | 
			
		||||
        Ok(Self {
 | 
			
		||||
            encodings,
 | 
			
		||||
            file_writer,
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn write_chunk(&mut self, chunk: Chunk<Box<dyn Array>>) -> Result<()> {
 | 
			
		||||
        let options = self.file_writer.options();
 | 
			
		||||
        // let columns = chunk
 | 
			
		||||
        //     .columns()
 | 
			
		||||
        //     .par_iter()
 | 
			
		||||
        //     .zip(self.file_writer.parquet_schema().fields().to_vec())
 | 
			
		||||
        //     .zip(self.encodings.par_iter())
 | 
			
		||||
        //     .flat_map(move |((array, type_), encoding)| {
 | 
			
		||||
        //         let encoded_columns =
 | 
			
		||||
        //             pq_write::array_to_columns(array, type_, options, encoding).unwrap();
 | 
			
		||||
        //         encoded_columns
 | 
			
		||||
        //             .into_iter()
 | 
			
		||||
        //             .map(|encoded_pages| {
 | 
			
		||||
        //                 let encoded_pages = pq_write::DynIter::new(
 | 
			
		||||
        //                     encoded_pages
 | 
			
		||||
        //                         .into_iter()
 | 
			
		||||
        //                         .map(|x| x.map_err(arrow2::error::Error::from_external_error)),
 | 
			
		||||
        //                 );
 | 
			
		||||
        //                 encoded_pages
 | 
			
		||||
        //                     .map(|page| {
 | 
			
		||||
        //                         pq_write::compress(page?, vec![], options.compression)
 | 
			
		||||
        //                             .map_err(|x| x.into())
 | 
			
		||||
        //                     })
 | 
			
		||||
        //                     .collect::<Result<VecDeque<_>>>()
 | 
			
		||||
        //             })
 | 
			
		||||
        //             .collect::<Vec<_>>()
 | 
			
		||||
        //     })
 | 
			
		||||
        //     .collect::<Result<Vec<VecDeque<pq_write::CompressedPage>>>>()?;
 | 
			
		||||
 | 
			
		||||
        // let rgi = pq_write::DynIter::new(
 | 
			
		||||
        //     columns
 | 
			
		||||
        //         .into_iter()
 | 
			
		||||
        //         .map(|column| Ok(pq_write::DynStreamingIterator::new(Bla::new(column)))),
 | 
			
		||||
        // );
 | 
			
		||||
 | 
			
		||||
        let rgi = row_group_iter(
 | 
			
		||||
            chunk,
 | 
			
		||||
            self.encodings.clone(),
 | 
			
		||||
            self.file_writer.parquet_schema().fields().to_vec(),
 | 
			
		||||
            options,
 | 
			
		||||
        );
 | 
			
		||||
        self.file_writer.write(rgi)?;
 | 
			
		||||
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn end(&mut self) -> Result<()> {
 | 
			
		||||
        // TODO: footer metadata?
 | 
			
		||||
        self.file_writer.end(None)?;
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Per-connection pipeline stage: arrow accumulation plus parquet output.
/// Both halves sit behind their own mutex because multiple chunk-decoding
/// worker threads call in concurrently (see `main`'s rayon loop).
struct ConnToParquet {
    to_arrow: Mutex<ConnToArrow>,
    parquet_writer: Mutex<ParquetWriter>,
}
 | 
			
		||||
 | 
			
		||||
impl ConnToParquet {
 | 
			
		||||
    fn try_new(layout: MessageLayout, expected_chunks: ChunkSet, file: File) -> Result<Self> {
 | 
			
		||||
        let to_arrow = ConnToArrow::try_new(layout, expected_chunks)?;
 | 
			
		||||
 | 
			
		||||
        let DataType::Struct(fields) = to_arrow.array.data_type() else { unreachable!() };
 | 
			
		||||
        let parquet_writer = ParquetWriter::try_new(fields, file)?;
 | 
			
		||||
 | 
			
		||||
        Ok(Self {
 | 
			
		||||
            to_arrow: to_arrow.into(),
 | 
			
		||||
            parquet_writer: parquet_writer.into(),
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn add_message(&self, chunk_idx: usize, message: MessageData) {
 | 
			
		||||
        self.to_arrow
 | 
			
		||||
            .lock()
 | 
			
		||||
            .unwrap()
 | 
			
		||||
            .add_message(chunk_idx, message);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn chunk_complete(&self, chunk_idx: usize) -> Result<()> {
 | 
			
		||||
        self.to_arrow.lock().unwrap().chunk_complete(chunk_idx)?;
 | 
			
		||||
 | 
			
		||||
        self.write_ready()?;
 | 
			
		||||
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn write_ready(&self) -> Result<()> {
 | 
			
		||||
        loop {
 | 
			
		||||
            let Some(chunk) = ({
 | 
			
		||||
                let mut to_arrow = self.to_arrow.lock().unwrap();
 | 
			
		||||
                to_arrow.take_chunk()
 | 
			
		||||
            }) else { break };
 | 
			
		||||
 | 
			
		||||
            self.parquet_writer.lock().unwrap().write_chunk(chunk)?;
 | 
			
		||||
        }
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn end(&self) -> Result<()> {
 | 
			
		||||
        self.to_arrow.lock().unwrap().flush();
 | 
			
		||||
        self.write_ready()?;
 | 
			
		||||
        self.parquet_writer.lock().unwrap().end()?;
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Whole-bag export state: one `ConnToParquet` per exported connection,
/// keyed by connection id, plus a copy of the bag index used to look up
/// which connections appear in each chunk.
struct BagToParquet {
    connections: HashMap<u32, ConnToParquet>,
    index: BagIndex,
}
 | 
			
		||||
 | 
			
		||||
impl BagToParquet {
 | 
			
		||||
    fn try_new(index: &BagIndex, layouts: &MessageLayouts, base_path: &Path) -> Result<Self> {
 | 
			
		||||
        let mut expected_chunks_per_conn: HashMap<u32, ChunkSet> = Default::default();
 | 
			
		||||
        for (chunk_idx, chunk) in index.chunks.iter().enumerate() {
 | 
			
		||||
            for conn in &chunk.connections {
 | 
			
		||||
                let expected_chunks = expected_chunks_per_conn.entry(conn.conn_id).or_default();
 | 
			
		||||
                expected_chunks.insert(chunk_idx);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        let connections: HashMap<_, _> = layouts
 | 
			
		||||
            .conns
 | 
			
		||||
            .iter()
 | 
			
		||||
            .filter(|(_, conn)| {
 | 
			
		||||
                !conn.info.topic.ends_with("image_rect_raw")
 | 
			
		||||
                    || conn.info.topic.starts_with("/camera3")
 | 
			
		||||
            })
 | 
			
		||||
            .map(|(id, conn)| -> Result<_> {
 | 
			
		||||
                let expected_chunks = std::mem::take(expected_chunks_per_conn.get_mut(id).unwrap());
 | 
			
		||||
                let conn_path = base_path.join(format!("{}.parquet", &conn.info.topic[1..]));
 | 
			
		||||
                if let Some(parent) = conn_path.parent() {
 | 
			
		||||
                    mkdirp::mkdirp(parent)?;
 | 
			
		||||
                }
 | 
			
		||||
                let conn_file = File::create(conn_path)?;
 | 
			
		||||
                let ctp = ConnToParquet::try_new(conn.layout.clone(), expected_chunks, conn_file)?;
 | 
			
		||||
                Ok((*id, ctp))
 | 
			
		||||
            })
 | 
			
		||||
            .collect::<Result<_>>()?;
 | 
			
		||||
        Ok(Self {
 | 
			
		||||
            connections,
 | 
			
		||||
            index: index.clone(),
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn add_message(&self, chunk_idx: usize, message: MessageData) {
 | 
			
		||||
        if let Some(ctp) = self.connections.get(&message.header.conn_id) {
 | 
			
		||||
            ctp.add_message(chunk_idx, message)
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn chunk_complete(&self, chunk_idx: usize) -> Result<()> {
 | 
			
		||||
        let chunk_info = self.index.chunks.get(chunk_idx).unwrap();
 | 
			
		||||
        for conn in &chunk_info.connections {
 | 
			
		||||
            if let Some(ctp) = self.connections.get(&conn.conn_id) {
 | 
			
		||||
                ctp.chunk_complete(chunk_idx)?;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn end(&self) -> Result<()> {
 | 
			
		||||
        self.connections
 | 
			
		||||
            .values()
 | 
			
		||||
            .par_bridge()
 | 
			
		||||
            .try_for_each(|ctp| ctp.end())?;
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn main() -> Result<()> {
 | 
			
		||||
    color_eyre::install()?;
 | 
			
		||||
    env_logger::init();
 | 
			
		||||
 | 
			
		||||
    let args: Vec<_> = args().collect();
 | 
			
		||||
    if args.len() != 3 {
 | 
			
		||||
        eprintln!("Usage: {} <bag path> <parquet file>", args[0]);
 | 
			
		||||
        return Ok(());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    let bag_path = &args[1];
 | 
			
		||||
    let arrow_path = &args[2];
 | 
			
		||||
    let mut bag = Bag::open(bag_path)?;
 | 
			
		||||
 | 
			
		||||
    let layouts = bag.compute_message_layouts()?;
 | 
			
		||||
 | 
			
		||||
    // let info = bag.compute_info()?;
 | 
			
		||||
    // let total_messages: u64 = info.per_connection.values().map(|con| con.count).sum();
 | 
			
		||||
    // info!("exporting {} messages", total_messages);
 | 
			
		||||
 | 
			
		||||
    let num_chunks = bag.index().chunks.len();
 | 
			
		||||
    let btp = Arc::new(BagToParquet::try_new(
 | 
			
		||||
        bag.index(),
 | 
			
		||||
        &layouts,
 | 
			
		||||
        Path::new(arrow_path),
 | 
			
		||||
    )?);
 | 
			
		||||
    bag.index()
 | 
			
		||||
        .chunks
 | 
			
		||||
        .iter()
 | 
			
		||||
        .enumerate()
 | 
			
		||||
        // Force using par_bridge as into_par_iter distributes chunks throughout the file
 | 
			
		||||
        .par_bridge()
 | 
			
		||||
        .progress_count(num_chunks as u64)
 | 
			
		||||
        .map_with(bag.reader().clone(), |reader, (idx, chunk)| {
 | 
			
		||||
            (idx, read_chunk_data_at(reader, chunk.pos))
 | 
			
		||||
        })
 | 
			
		||||
        .for_each_with(btp.clone(), |btp, (chunk_idx, chunk)| {
 | 
			
		||||
            let it = ChunkMessageIterator::new(chunk.unwrap());
 | 
			
		||||
            for msg in it {
 | 
			
		||||
                let msg = msg.unwrap();
 | 
			
		||||
                btp.add_message(chunk_idx, msg);
 | 
			
		||||
            }
 | 
			
		||||
            btp.chunk_complete(chunk_idx).unwrap();
 | 
			
		||||
        });
 | 
			
		||||
 | 
			
		||||
    btp.end()?;
 | 
			
		||||
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										19
									
								
								rsbagpy/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								rsbagpy/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,19 @@
 | 
			
		||||
[package]
 | 
			
		||||
name = "rsbagpy"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
edition = "2018"
 | 
			
		||||
 | 
			
		||||
[lib]
 | 
			
		||||
# The name of the native library. This is the name which will be used in Python to import the
# library (i.e. `import rsbag`). If you change this, you must also change the name of the
# `#[pymodule]` in `src/lib.rs`.
 | 
			
		||||
name = "rsbag"
 | 
			
		||||
# "cdylib" is necessary to produce a shared library for Python to import from.
 | 
			
		||||
#
 | 
			
		||||
# Downstream Rust code (including code in `bin/`, `examples/`, and `tests/`) will not be able
# to `use rsbag;` unless the "rlib" or "lib" crate type is also included, e.g.:
 | 
			
		||||
# crate-type = ["cdylib", "rlib"]
 | 
			
		||||
crate-type = ["cdylib"]
 | 
			
		||||
 | 
			
		||||
[dependencies]
 | 
			
		||||
pyo3 = { version = "0.16.4", features = ["extension-module"] }
 | 
			
		||||
							
								
								
									
										30
									
								
								rsbagpy/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								rsbagpy/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,30 @@
 | 
			
		||||
use std::path::PathBuf;
 | 
			
		||||
 | 
			
		||||
use pyo3::prelude::*;
 | 
			
		||||
 | 
			
		||||
/// Placeholder for a single bag message exposed to Python.
/// TODO: carry actual message data (topic, timestamp, payload).
#[pyclass]
struct Message {}
 | 
			
		||||
 | 
			
		||||
/// Placeholder Python handle to a rosbag file.
/// TODO: wrap the real bag-reading implementation once it is wired up.
#[pyclass]
struct Bag {}
 | 
			
		||||
 | 
			
		||||
#[pymethods]
 | 
			
		||||
impl Bag {
 | 
			
		||||
    #[new]
 | 
			
		||||
    fn new(path: PathBuf) -> Self {
 | 
			
		||||
        Self {}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn read(&mut self) -> Option<Message> {
 | 
			
		||||
        Some(Message {})
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// A Python module implemented in Rust. The name of this function must match
 | 
			
		||||
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
 | 
			
		||||
/// import the module.
 | 
			
		||||
#[pymodule]
 | 
			
		||||
fn rsbag(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
 | 
			
		||||
    m.add_class::<Bag>()?;
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user