Skip to content

Commit

Permalink
Generalize segments for mutable indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
Avarel committed Nov 22, 2023
1 parent b856ce8 commit c75c86a
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 122 deletions.
46 changes: 23 additions & 23 deletions crates/core/src/buf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ impl BufferRepr {
len,
segments,
} => {
let range = {
let seg_id = seg_id as u64;
(seg_id * crate::SEG_SIZE)..((seg_id + 1) * crate::SEG_SIZE).min(*len)
};
let range = Segment::data_range_of_id(seg_id);
let range = range.start..range.end.min(*len);
segments
.get_or_insert(seg_id, || Arc::new(Segment::map_file(seg_id, range, file)))
.get_or_insert(seg_id, || {
Arc::new(Segment::map_file(seg_id, range, file).unwrap())
})
.clone()
}
BufferRepr::Stream {
Expand Down Expand Up @@ -228,17 +228,17 @@ where
}

/// Retrieves a line of text from the buffer based on the given line number.
///
/// # Arguments
///
/// * `line_number` - The line number to retrieve.
///
/// # Panics
///
/// This function will panic if the `line_number` is greater than the total number
Expand All @@ -252,14 +252,14 @@ where

let data_start = self.index.data_of_line(line_number).unwrap();
let data_end = self.index.data_of_line(line_number + 1).unwrap();
let seg_start = (data_start / crate::SEG_SIZE) as usize;
let seg_end = (data_end / crate::SEG_SIZE) as usize;
let seg_start = Segment::id_of_data(data_start);
let seg_end = Segment::id_of_data(data_end);

if seg_start == seg_end {
// The data is in a single segment
let seg = self.repr.fetch(seg_start);
let (start, end) = seg.translate_inner_data_range(data_start, data_end);
seg.get_line(start, end)
let range = seg.translate_inner_data_range(data_start, data_end);
seg.get_line(range)
} else {
debug_assert!(seg_start < seg_end);
// The data may cross several segments, so we must piece together
Expand Down Expand Up @@ -356,8 +356,8 @@ where
let curr_line_data_start = self.index.data_of_line(curr_line).unwrap();
let curr_line_data_end = self.index.data_of_line(curr_line + 1).unwrap();

let curr_line_seg_start = (curr_line_data_start / crate::SEG_SIZE) as usize;
let curr_line_seg_end = (curr_line_data_end / crate::SEG_SIZE) as usize;
let curr_line_seg_start = Segment::id_of_data(curr_line_data_start);
let curr_line_seg_end = Segment::id_of_data(curr_line_data_end);

if curr_line_seg_end != curr_line_seg_start {
self.imm_buf.clear();
Expand All @@ -380,8 +380,8 @@ where
self.line_range.start += 1;
return Some((&self.index, curr_line_data_start, &self.imm_buf));
} else {
let curr_seg_data_start = curr_line_seg_start as u64 * crate::SEG_SIZE;
let curr_seg_data_end = curr_seg_data_start + crate::SEG_SIZE;
let curr_seg_data_start = curr_line_seg_start as u64 * Segment::MAX_SIZE;
let curr_seg_data_end = curr_seg_data_start + Segment::MAX_SIZE;

let line_end = self
.index
Expand All @@ -391,10 +391,10 @@ where

// this line should not cross multiple segments, else we would have caught in the first case
let segment = self.repr.fetch(curr_line_seg_start);
let (start, end) =
let range =
segment.translate_inner_data_range(curr_line_data_start, line_end_data_start);
assert!(line_end_data_start - curr_seg_data_start <= crate::SEG_SIZE);
assert!(end <= crate::SEG_SIZE);
assert!(line_end_data_start - curr_seg_data_start <= Segment::MAX_SIZE);
assert!(range.end <= Segment::MAX_SIZE);

self.line_range.start = line_end;
let segment = self.imm_seg.insert(segment);
Expand All @@ -403,7 +403,7 @@ where
return Some((
&self.index,
curr_line_data_start,
&segment[start as usize..end as usize],
&segment[range.start as usize..range.end as usize],
));
}
}
Expand Down
139 changes: 100 additions & 39 deletions crates/core/src/buf/segment.rs
Original file line number Diff line number Diff line change
@@ -1,66 +1,125 @@
use crate::Result;
use memmap2::{Mmap, MmapMut};
use std::{borrow::Cow, ops::Range, ptr::NonNull, sync::Arc};
use crate::Mmappable;

pub struct Segment {
#[cfg(unix)]
pub(crate) use std::os::fd::AsRawFd as Mmappable;
#[cfg(windows)]
pub(crate) use std::os::windows::io::AsRawHandle as Mmappable;

/// A fixed-size window of the underlying buffer, generic over the backing
/// byte storage (`Mmap` for read-only segments, `MmapMut` for writable
/// ones — see the [`Segment`] and [`SegmentMut`] aliases).
pub struct SegmentRaw<Buf> {
    // Index of this segment within the buffer; segment `id` covers the
    // absolute byte offsets `id * MAX_SIZE .. (id + 1) * MAX_SIZE`
    // (see `data_range_of_id`).
    id: usize,
    // Absolute byte range of the buffer that this segment spans.
    range: Range<u64>,
    // Backing storage holding this segment's bytes.
    data: Buf,
}

impl Segment {
pub(crate) fn map_file<F: Mmappable>(id: usize, range: Range<u64>, file: &F) -> Self {
let data = unsafe {
memmap2::MmapOptions::new()
.offset(range.start)
.len((range.end - range.start) as usize)
.map(file).expect("mmap should succeed")
};
#[cfg(unix)]
data.advise(memmap2::Advice::WillNeed).ok();
Self::new(id, range.start, data)
}
/// Writable segment backed by an anonymous, mutable memory map.
pub type SegmentMut = SegmentRaw<MmapMut>;
/// Immutable segment backed by a read-only memory map.
pub type Segment = SegmentRaw<Mmap>;

pub(crate) fn new(id: usize, start: u64, data: memmap2::Mmap) -> Self {
Self {
id,
data,
start,
}
}
impl<Buf> SegmentRaw<Buf>
where
Buf: AsRef<[u8]>,
{
pub const MAX_SIZE: u64 = 1 << 20;

pub fn id(&self) -> usize {
self.id
}

pub fn as_slice(&self) -> &[u8] {
&self
pub fn start(&self) -> u64 {
self.range.start
}

pub fn translate_inner_data_index(&self, start: u64) -> u64 {
start - self.start
debug_assert!(self.range.start <= start);
// TODO: make this better... i don't like that its <=
// but technically its fine as long as start
// is the end of the buffer
debug_assert!(start <= self.range.end);
start - self.range.start
}

pub fn translate_inner_data_range(&self, start: u64, end: u64) -> (u64, u64) {
(self.translate_inner_data_index(start), self.translate_inner_data_index(end))
pub fn translate_inner_data_range(&self, start: u64, end: u64) -> Range<u64> {
self.translate_inner_data_index(start)..self.translate_inner_data_index(end)
}

pub fn get_line(self: &Arc<Self>, start: u64, end: u64) -> SegStr {
let data = &self.data[start as usize..end as usize];
// Safety: The length is computed by a (assumed to be correct)
// index. It is undefined behavior if the file changes
// in a non-appending way after the index is created.
SegStr::new(self.clone(), data)
pub fn id_of_data(start: u64) -> usize {
(start / Self::MAX_SIZE) as usize
}

pub fn data_range_of_id(id: usize) -> Range<u64> {
let start = id as u64 * Self::MAX_SIZE;
start..start + Self::MAX_SIZE
}
}

impl std::ops::Deref for Segment {
// Let a segment be used directly as a byte slice whenever its backing
// storage dereferences to `[u8]` (both `Mmap` and `MmapMut` do).
impl<Buf> std::ops::Deref for SegmentRaw<Buf>
where
    Buf: std::ops::Deref<Target = [u8]>,
{
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        &self.data
    }
}

// Mutable counterpart of the `Deref` impl: exposes the segment's bytes as
// `&mut [u8]` for writable backings (i.e. `SegmentMut`).
impl<Buf> std::ops::DerefMut for SegmentRaw<Buf>
where
    Buf: std::ops::DerefMut<Target = [u8]>,
{
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.data
    }
}

impl SegmentMut {
    /// Create a writable, anonymous segment of `MAX_SIZE` bytes whose
    /// logical data range starts at the absolute offset `start`.
    ///
    /// # Errors
    ///
    /// Fails if the anonymous mapping cannot be created (or, on unix, if
    /// the `Sequential` advice cannot be applied).
    pub(crate) fn new(id: usize, start: u64) -> Result<Self> {
        let range = start..start + Self::MAX_SIZE;
        let data = memmap2::MmapOptions::new()
            .len(Self::MAX_SIZE as usize)
            .map_anon()?;
        // Hint the kernel that this buffer will be filled front-to-back.
        #[cfg(unix)]
        data.advise(memmap2::Advice::Sequential)?;
        Ok(Self { id, range, data })
    }

    /// Freeze this writable segment into an immutable [`Segment`],
    /// re-protecting the underlying pages as read-only.
    ///
    /// # Errors
    ///
    /// Fails if the mapping's protection cannot be changed.
    pub fn into_read_only(self) -> Result<Segment> {
        let Self { id, data, range } = self;
        Ok(Segment {
            id,
            data: data.make_read_only()?,
            range,
        })
    }
}

impl Segment {
    /// Memory-map the absolute byte `range` of `file` as an immutable
    /// segment with the given `id`.
    ///
    /// # Errors
    ///
    /// Fails if the mapping cannot be created (or, on unix, if the
    /// `WillNeed` advice cannot be applied).
    pub(crate) fn map_file<F: Mmappable>(id: usize, range: Range<u64>, file: &F) -> Result<Self> {
        let size = range.end - range.start;
        debug_assert!(size <= Self::MAX_SIZE);

        // SAFETY: mapping a file is unsafe because the file may be mutated
        // externally while mapped; callers uphold the append-only
        // assumption documented on `SegStr::new`.
        let data = unsafe {
            let mut options = memmap2::MmapOptions::new();
            options.offset(range.start).len(size as usize);
            options.map(file)?
        };
        // Hint the kernel that these pages will be read soon.
        #[cfg(unix)]
        data.advise(memmap2::Advice::WillNeed)?;

        Ok(Self { id, data, range })
    }

    /// Slice out the line at `range` (segment-local offsets) as a
    /// [`SegStr`] that keeps this segment alive for as long as the string
    /// borrows from it.
    pub fn get_line(self: &Arc<Self>, range: Range<u64>) -> SegStr {
        let bytes = &self.data[range.start as usize..range.end as usize];
        // Safety: The length is computed by a (assumed to be correct)
        // index. It is undefined behavior if the file changes
        // in a non-appending way after the index is created.
        SegStr::new(Arc::clone(self), bytes)
    }
}

/// Line string that comes from a [Segment].
///
/// If the [SegStr] borrows from the segment, the segment will not be dropped until
Expand Down Expand Up @@ -88,7 +147,7 @@ impl SegStr {
/// is invalid utf-8, it will be converted into an owned [String] using `String::from_utf8_lossy`.
///
/// # Safety
///
///
/// 1. The provided slice must point to data that lives inside the ref-counted [Segment].
/// 2. The length must encompass a valid range of data inside the [Segment].
fn new<'origin>(origin: Arc<Segment>, data: &'origin [u8]) -> Self {
Expand All @@ -113,9 +172,11 @@ impl SegStr {
pub fn as_bytes(&self) -> &[u8] {
// Safety: We have already checked in the constructor.
match &self.0 {
SegStrRepr::Borrowed { _ref: _pin, ptr, len } => unsafe {
std::slice::from_raw_parts(ptr.as_ptr(), *len)
},
SegStrRepr::Borrowed {
_ref: _pin,
ptr,
len,
} => unsafe { std::slice::from_raw_parts(ptr.as_ptr(), *len) },
SegStrRepr::Owned(s) => s.as_bytes(),
}
}
Expand Down
Loading

0 comments on commit c75c86a

Please sign in to comment.