Automatically generated by Mendeley Desktop 1.19.8
Any changes to this file will be lost if it is regenerated by Mendeley.
BibTeX export options can be customized via Options -> BibTeX in Mendeley Desktop
@article{Damme2019,
abstract = {Lightweight integer compression algorithms are frequently applied in in-memory database systems to tackle the growing gap between processor speed and main memory bandwidth. In recent years, the vectorization of basic techniques such as delta coding and null suppression has considerably enlarged the corpus of available algorithms. As a result, today there is a large number of algorithms to choose from, while different algorithms are tailored to different data characteristics. However, a comparative evaluation of these algorithms with different data and hardware characteristics has never been sufficiently conducted in the literature. To close this gap, we conducted an exhaustive experimental survey by evaluating several state-of-the-art lightweight integer compression algorithms as well as cascades of basic techniques. We systematically investigated the influence of data as well as hardware properties on the performance and the compression rates. The evaluated algorithms are based on publicly available implementations as well as our own vectorized reimplementations. We summarize our experimental findings leading to several new insights and to the conclusion that there is no single-best algorithm. Moreover, in this article, we also introduce and evaluate a novel cost model for the selection of a suitable lightweight integer compression algorithm for a given dataset.},
annote = {- cost model},
author = {Damme, Patrick and Ungeth{\"{u}}m, Annett and Hildebrandt, Juliana and Habich, Dirk and Lehner, Wolfgang},
doi = {10.1145/3323991},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/DammePatrick et al. - 2019 - From a Comprehensive Experimental Survey to a Cost-based Selection Strategy for Lightweight Integer Compres.pdf:pdf},
issn = {15574644},
journal = {ACM Transactions on Database Systems},
keywords = {Compression algorithm selection,Cost modeling,Experiment and analysis,Lightweight data compression,SIMD,Vectorization},
month = {jun},
number = {3},
publisher = {Association for Computing Machinery, New York, NY, USA},
title = {{From a Comprehensive Experimental Survey to a Cost-Based Selection Strategy for Lightweight Integer Compression Algorithms}},
url = {https://dl.acm.org/doi/abs/10.1145/3323991},
volume = {44},
year = {2019}
}
@inproceedings{Damme2017,
abstract = {Lightweight data compression algorithms are frequently applied in in-memory database systems to tackle the growing gap between processor speed and main memory bandwidth. In recent years, the vectorization of basic techniques such as delta coding and null suppression has considerably enlarged the corpus of available algorithms. As a result, today there is a large number of algorithms to choose from, while different algorithms are tailored to different data characteristics. However, a comparative evaluation of these algorithms under different data characteristics has never been sufficiently conducted in the literature. To close this gap, we conducted an exhaustive experimental survey by evaluating several state-of-the-art compression algorithms as well as cascades of basic techniques. We systematically investigated the influence of the data properties on the performance and the compression rates. The evaluated algorithms are based on publicly available implementations as well as our own vectorized reimplementations. We summarize our experimental findings leading to several new insights and to the conclusion, that there is no single-best algorithm.},
annote = {- large variety of lightweight algorithms
- no single best algorithm
- compromise between performance and compression rate is necessary},
author = {Damme, Patrick and Habich, Dirk and Hildebrandt, Juliana and Lehner, Wolfgang},
booktitle = {Advances in Database Technology - EDBT},
doi = {10.5441/002/edbt.2017.08},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Damme et al. - Unknown - Lightweight Data Compression Algorithms An Experimental Survey Experiments and Analyses.pdf:pdf},
isbn = {9783893180738},
issn = {23672005},
pages = {72--83},
title = {{Lightweight data compression algorithms: An experimental survey (experiments and analyses)}},
volume = {2017-March},
year = {2017}
}
@inproceedings{Abadi2006,
abstract = {Column-oriented database system architectures invite a re-evaluation of how and when data in databases is compressed. Storing data in a column-oriented fashion greatly increases the similarity of adjacent records on disk and thus opportunities for compression. The ability to compress many adjacent tuples at once lowers the per-tuple cost of compression, both in terms of CPU and space overheads. In this paper, we discuss how we extended C-Store (a column-oriented DBMS) with a compression sub-system. We show how compression schemes not traditionally used in row-oriented DBMSs can be applied to column-oriented systems. We then evaluate a set of compression schemes and show that the best scheme depends not only on the properties of the data but also on the nature of the query workload. Copyright 2006 ACM.},
author = {Abadi, Daniel and Madden, Samuel and Ferreira, Miguel},
booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data},
doi = {10.1145/1142473.1142548},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Abadi, Madden, Ferreira - 2006 - Integrating compression and execution in column-oriented database systems.pdf:pdf},
isbn = {1595934340},
issn = {07308078},
keywords = {Column-oriented databases,Column-stores,Database compression,Query execution},
pages = {671--682},
title = {{Integrating compression and execution in column-oriented database systems}},
year = {2006}
}
@article{Hildebrandt2017,
abstract = {Lossless lightweight data compression is a very important optimization technique in various application domains like database systems, information retrieval or machine learning. Despite this importance, currently, there exists no comprehensive and non-technical abstraction. To overcome this issue, we have developed a systematic approach using metamodeling that focuses on the non-technical concepts of these algorithms. In this paper, we describe COLLATE, the metamodel we developed, and show that each algorithm can be described as a model conforming with COLLATE. Furthermore, we use COLLATE to specify a compression algorithm language COALA, so that lightweight data compression algorithms can be specified and modified in a descriptive and abstract way. Additionally, we present an approach to transform such descriptive algorithms into executable code. As we are going to show, our abstract and non-technical approach offers several advantages.},
author = {Hildebrandt, Juliana and Habich, Dirk and Kuhn, Thomas and Damme, Patrick and Lehner, Wolfgang},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Hildebrandt et al. - 2017 - Metamodeling lightweight data compression algorithms and its application scenarios.pdf:pdf},
issn = {16130073},
journal = {CEUR Workshop Proceedings},
pages = {128--141},
title = {{Metamodeling lightweight data compression algorithms and its application scenarios}},
volume = {1979},
year = {2017}
}
@article{Woltmann2021,
title = {{Learned Selection Strategy for Lightweight Integer Compression Algorithms}},
author = {Woltmann, Lucas and Hartmann, Claudio and Habich, Dirk and Lehner, Wolfgang and Damme, Patrick},
journal = {preprint},
publisher = {ACM, New York, NY, USA},
year = {2021}
}
@inproceedings{JahanLisa2019,
abstract = {To efficiently support analytical applications from a data management perspective, in-memory column store database systems are state-of-the art. In this kind of database system, lossless lightweight integer compression schemes are crucial to keep the memory storage as low as possible and to speedup query processing. In this specific compression domain, BitPacking is one of the most frequently applied compression scheme. However, (de) compression should not come with any additional cost during run time, but should be provided transparently without compromising the overall system performance. To achieve that, we focus on acceleration of BitPacking using Field Programmable Gate Arrays (FPGAs). Therefore, we outline several FPGA designs for BitPacking in this paper. As we are going to show in our evaluation, our specific designs provide the BitPacking compression scheme with high-throughput.},
author = {{Jahan Lisa}, Nusrat and Nguyen, Tuan Duy Anh and Habich, Dirk and Kumar, Akash and Lehner, Wolfgang},
booktitle = {Proceedings - Euromicro Conference on Digital System Design, DSD 2019},
doi = {10.1109/DSD.2019.00101},
file = {:home/moritz/Downloads/High-Throughput_BitPacking_Compression.pdf:pdf},
isbn = {9781728128610},
keywords = {BitPacking,FPGA,in-memory database systems,lightweight data compression},
month = {aug},
pages = {643--646},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{High-Throughput BitPacking Compression}},
year = {2019}
}
@inproceedings{Mohsen2020,
abstract = {An efficient compression of integer vectors is critical in dictionary-encoded column stores like SAP HANA to keep more data in the limited and precious main memory. Past research focused on lightweight compression techniques that trade low latency of data accesses for lower compression ratios. Consequently, only few columns in a wide table benefit from light-weight and effective compression schemes like run-length encoding, prefix compression or sparse encoding. Besides bit-packing, other columns remained uncompressed, which clearly misses opportunities for a better compression ratio for many columns. Furthermore, the main executor for compression was the CPU as compression involves heavy data transfer. Especially when used with co-processors, the data transfer overhead wipes out performance gains from co-processor usage. In this paper, we investigate whether we can achieve good compression ratios even for previously uncompressed columns by using binary packing and prefix suppression offloaded to an FPGA. As a streaming-processor, an FPGA is the perfect candidate to outsource the compression task. As a result of our OpenCL-based implementation, we achieve a saturation of the available PCIe bus during compression on the FPGA, by using less than a third the FPGA's resources. Furthermore, our real-world experiments against CPU-based SAP HANA shows a performance improvement of around a factor of 2 in compression throughput while compressing the data down to 60% of the best SAP HANA compression technique.},
address = {New York, NY, USA},
author = {Mohsen, Mahmoud and May, Norman and F{\"{a}}rber, Christian and Broneske, David},
booktitle = {Proceedings of the 16th International Workshop on Data Management on New Hardware, DaMoN 2020},
doi = {10.1145/3399666.3399932},
isbn = {9781450380249},
keywords = {FPGA,binary packing,compression},
publisher = {ACM},
title = {{FPGA-Accelerated Compression of Integer Vectors}},
url = {https://doi.org/10.1145/3399666.3399932},
year = {2020}
}
@article{Damme2020,
abstract = {In this paper, we present MorphStore, an open-source in-memory columnar analytical query engine with a novel holistic compression-enabled processing model. Basically, compression using lightweight integer compression algorithms already plays an important role in existing in-memory column-store database systems, but mainly for base data. In particular, during query processing, these systems only keep the data compressed until an operator cannot process the compressed data directly, whereupon the data is decompressed, but not recompressed. Thus, the full potential of compression during query processing is not exploited. To overcome that, we developed a novel compression-enabled processing model as presented in this paper. As we are going to show, the continuous usage of compression for all base data and all intermediates is very beneficial to reduce the overall memory footprint as well as to improve the query performance.},
archivePrefix = {arXiv},
arxivId = {2004.09350},
author = {Damme, Patrick and Ungeth{\"{u}}m, Annett and Pietrzyk, Johannes and Krause, Alexander and Habich, Dirk and Lehner, Wolfgang},
doi = {10.14778/3407790.3407833},
eprint = {2004.09350},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Damme et al. - 2020 - MorphStore Analytical Query Engine with a Holistic Compression-Enabled Processing Model(2).pdf:pdf},
issn = {21508097},
journal = {Proceedings of the VLDB Endowment},
month = {apr},
number = {11},
pages = {2396--2410},
title = {{MorphStore: Analytical query engine with a holistic compression-enabled processing model}},
url = {https://arxiv.org/abs/2004.09350v1},
volume = {13},
year = {2020}
}
@article{Kristo2020,
abstract = {Sorting is one of the most fundamental algorithms in Computer Science and a common operation in databases not just for sorting query results but also as part of joins (i.e., sort-merge-join) or indexing. In this work, we introduce a new type of distribution sort that leverages a learned model of the empirical CDF of the data. Our algorithm uses a model to efficiently get an approximation of the scaled empirical CDF for each record key and map it to the corresponding position in the output array. We then apply a deterministic sorting algorithm that works well on nearly-sorted arrays (e.g., Insertion Sort) to establish a totally sorted order. We compared this algorithm against common sorting approaches and measured its performance for up to 1 billion normally-distributed double-precision keys. The results show that our approach yields an average 3.38x performance improvement over C++ STL sort, which is an optimized Quicksort hybrid, 1.49x improvement over sequential Radix Sort, and 5.54x improvement over a C++ implementation of Timsort, which is the default sorting function for Java and Python.},
author = {Kristo, Ani and Vaidya, Kapil and {\c{C}}etintemel, Ugur and Misra, Sanchit and Kraska, Tim},
doi = {10.1145/3318464.3389752},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Kristo et al. - 2020 - The Case for a Learned Sorting Algorithm.pdf:pdf},
journal = {Proceedings of the ACM SIGMOD International Conference on Management of Data},
keywords = {CDF,ML for systems,RMI,learned algorithm,linear interpolation,linear models,sorting,sorting algorithm},
month = {jun},
pages = {1001--1016},
publisher = {Association for Computing Machinery},
title = {{The Case for a Learned Sorting Algorithm}},
url = {https://doi.org/10.1145/3318464.3389752},
year = {2020}
}
@inproceedings{Kipf,
abstract = {We describe a new deep learning approach to cardinality estimation. MSCN is a multi-set convolutional network, tailored to representing relational query plans, that employs set semantics to capture query features and true cardinalities. MSCN builds on sampling-based estimation, addressing its weaknesses when no sampled tuples qualify a predicate, and in capturing join-crossing correlations. Our evaluation of MSCN using a real-world dataset shows that deep learning significantly enhances the quality of cardinality estimation, which is the core problem in query optimization.},
archivePrefix = {arXiv},
arxivId = {1809.00677},
author = {Kipf, Andreas and Kipf, Thomas and Radke, Bernhard and Leis, Viktor and Boncz, Peter and Kemper, Alfons},
booktitle = {CIDR 2019 - 9th Biennial Conference on Innovative Data Systems Research},
eprint = {1809.00677},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Kipf et al. - Unknown - Learned Cardinalities Estimating Correlated Joins with Deep Learning(2).pdf:pdf},
title = {{Learned cardinalities: Estimating correlated joins with deep learning}},
year = {2019}
}
@inproceedings{Jin2019,
abstract = {Data compression is a key part of database management systems for storage saving and performance enhancement. In column-oriented databases, records belong to the same attribute are stored nearby, and the similarity between these records increases the compressibility of data and expands the range of compression algorithms to choose. Since different data compression algorithms process data in different manners, the achieved compression ratio varies significantly. This makes it worth studying the choice of compression algorithms depending on features of data to be compressed. As Recurrent Neural Networks is good at processing and making predictions based on series of data, we propose a Long-Short Term Memory network based model to select compression algorithm for input data blocks adaptively. Given a typical database benchmark, we implemented our model to formulate compression strategies for each data block and managed to reduce at most 15% storage size than using a single compression algorithm scheme.},
author = {Jin, Yingting and Fu, Yuzhuo and Liu, Ting and Dong, Lan},
booktitle = {Proceedings of 2019 IEEE 3rd Information Technology, Networking, Electronic and Automation Control Conference, ITNEC 2019},
doi = {10.1109/ITNEC.2019.8729341},
file = {:home/moritz/Downloads/Adaptive_Compression_Algorithm_Selection_Using_LSTM_Network_in_Column-oriented_Database.pdf:pdf},
isbn = {9781538662434},
keywords = {Adaptive algorithm,Column-store,Data compression,Long-Short Term Memory},
month = {mar},
pages = {652--656},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Adaptive compression algorithm selection using LSTM network in column-oriented database}},
year = {2019}
}
@inproceedings{Boissier,
abstract = {Modern main memory-optimized column stores employ a variety of compression techniques. Deciding for one compression technique over others for a given memory budget can be challenging since each technique has different trade-offs whose impact on large workloads is not obvious. We present an automated selection framework for compression configurations. Most database systems provide means to automatically choose a compression configuration but lack two crucial properties: The compression selection cannot be constrained (e.g., by a given storage budget) and robustness of the compression configuration is not considered. Our approach uses workload information to determine robust configurations under the given constraints. The runtime performance of the various compression techniques is estimated using adapted regression models.},
author = {Boissier, Martin and Jendruk, Max},
booktitle = {Advances in Database Technology - EDBT},
doi = {10.5441/002/edbt.2019.84},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Boissier, Jendruk - Unknown - Workload-Driven and Robust Selection of Compression Schemes for Column Stores(2).pdf:pdf},
isbn = {9783893180813},
issn = {23672005},
pages = {674--677},
title = {{Workload-driven and robust selection of compression schemes for column stores}},
volume = {2019-March},
year = {2019}
}
@inproceedings{Heaton2016,
abstract = {Machine learning models, such as neural networks, decision trees, random forests and gradient boosting machines accept a feature vector and provide a prediction. These models learn in a supervised fashion where a set of feature vectors with expected output is provided. It is very common practice to engineer new features from the provided feature set. Such engineered features will either augment, or replace portions of the existing feature vector. These engineered features are essentially calculated fields, based on the values of the other features. Engineering such features is primarily a manual, time-consuming task. Additionally, each type of model will respond differently to different types of engineered features. This paper reports on empirical research to demonstrate what types of engineered features are best suited to which machine learning model type. This is accomplished by generating several datasets that are designed to benefit from a particular type of engineered feature. The experiment demonstrates to what degree the machine learning model is capable of synthesizing the needed feature on its own. If a model is capable of synthesizing an engineered feature, it is not necessary to provide that feature. The research demonstrated that the studied models do indeed perform differently with various types of engineered features.},
archivePrefix = {arXiv},
arxivId = {1701.07852},
author = {Heaton, Jeff},
booktitle = {Conference Proceedings - IEEE SOUTHEASTCON},
doi = {10.1109/SECON.2016.7506650},
eprint = {1701.07852},
file = {:home/moritz/Downloads/An_empirical_analysis_of_feature_engineering_for_predictive_modeling.pdf:pdf},
isbn = {9781509022465},
issn = {07347502},
month = {jul},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{An empirical analysis of feature engineering for predictive modeling}},
volume = {2016-July},
year = {2016}
}
@article{Bengio2013,
abstract = {The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, autoencoders, manifold learning, and deep networks. This motivates longer term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation, and manifold learning. {\textcopyright} 1979-2012 IEEE.},
archivePrefix = {arXiv},
arxivId = {1206.5538},
author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
doi = {10.1109/TPAMI.2013.50},
eprint = {1206.5538},
file = {:home/moritz/Downloads/Representation_Learning_A_Review_and_New_Perspectives.pdf:pdf},
issn = {01628828},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Boltzmann machine,Deep learning,autoencoder,feature learning,neural nets,representation learning,unsupervised learning},
number = {8},
pages = {1798--1828},
pmid = {23787338},
title = {{Representation learning: A review and new perspectives}},
volume = {35},
year = {2013}
}
@article{Ghit2020,
abstract = {We formulate a conceptual model for white-box compression, which represents the logical columns in tabular data as an openly defined function over some actually stored physical columns. Each block of data should thus go accompanied by a header that describes this functional mapping. Because these compression functions are openly defined, database systems can exploit them using query optimization and during execution, enabling e.g. better filter predicate push-down. In addition, we show that white-box compression is able to identify a broad variety of new opportunities for compression, leading to much better compression factors. These opportunities are identified using an automatic learning process that learns the functions from the data. We provide a recursive pattern-driven algorithm for such learning. Finally, we demonstrate the effectiveness of white-box compression on a new benchmark we contribute hereby: the Public BI benchmark provides a rich set of real-world datasets. We believe our basic prototype for white-box compression opens the way for future research into transparent compressed data representations on the one hand and database system architectures that can efficiently exploit these on the other, and should be seen as another step into the direction of data management systems that are self-learning and optimize themselves for the data they are deployed on.},
author = {Ghita, Bogdan and Tom{\'{e}}, Diego and Boncz, Peter},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ghit et al. - Unknown - White-box Compression Learning and Exploiting Compact Table Representations.pdf:pdf},
journal = {CIDR},
title = {{White-box Compression: Learning and Exploiting Compact Table Representations}},
year = {2020}
}
@article{Breiman2013,
abstract = {Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund & R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, * * * , 148-156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression.},
author = {Breiman, Leo},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Breiman - 2001 - Random Forests.pdf:pdf},
journal = {Machine Learning},
keywords = {classification,ensemble,regression},
pages = {5--32},
title = {{Random Forests}},
volume = {45},
year = {2001}
}
@inproceedings{Lundberg2017,
abstract = {Understanding why a model makes a certain prediction can be as crucial as the prediction's accuracy in many applications. However, the highest accuracy for large modern datasets is often achieved by complex models that even experts struggle to interpret, such as ensemble or deep learning models, creating a tension between accuracy and interpretability. In response, various methods have recently been proposed to help users interpret the predictions of complex models, but it is often unclear how these methods are related and when one method is preferable over another. To address this problem, we present a unified framework for interpreting predictions, SHAP (SHapley Additive exPlanations). SHAP assigns each feature an importance value for a particular prediction. Its novel components include: (1) the identification of a new class of additive feature importance measures, and (2) theoretical results showing there is a unique solution in this class with a set of desirable properties. The new class unifies six existing methods, notable because several recent methods in the class lack the proposed desirable properties. Based on insights from this unification, we present new methods that show improved computational performance and/or better consistency with human intuition than previous approaches.},
archivePrefix = {arXiv},
arxivId = {1705.07874},
author = {Lundberg, Scott M. and Lee, Su-In},
booktitle = {Advances in Neural Information Processing Systems},
eprint = {1705.07874},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lundberg, Lee - 2017 - A Unified Approach to Interpreting Model Predictions.pdf:pdf},
issn = {10495258},
pages = {4766--4775},
title = {{A unified approach to interpreting model predictions}},
url = {https://github.com/slundberg/shap},
volume = {2017-December},
year = {2017}
}
@techreport{Reinsel2018,
abstract = {IDC predicts that the Global Datasphere will grow from 33 Zettabytes in 2018 to 175 Zettabytes by 2025},
author = {Reinsel, David and Gantz, John and Rydning, John},
booktitle = {The Digitization of the World From Edge to Core},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Reinsel, Gantz, Rydning - 2018 - The Digitization of the World From Edge to Core.pdf:pdf},
institution = {Seagate Technology, IDC},
keywords = {IDC,Seagate},
publisher = {Seagate Technology, IDC},
title = {{Data Age 2025: The Digitization of the World From Edge to Core}},
url = {https://www.seagate.com/files/www-content/our-story/trends/files/idc-seagate-dataage-whitepaper.pdf},
year = {2018}
}
@article{Elgohary2016,
abstract = {Large-scale machine learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications to converge to an optimal model. It is crucial for performance to fit the data into single-node or distributed main memory. General-purpose, heavy- and lightweight compression techniques struggle to achieve both good compression ratios and fast decompression speed to enable block-wise uncompressed operations. Hence, we initiate work on compressed linear algebra (CLA), in which lightweight database compression techniques are applied to matrices and then linear algebra operations such as matrix-vector multiplication are executed directly on the compressed representations. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show that CLA achieves in-memory operations performance close to the uncompressed case and good compression ratios that allow us to fit larger datasets into available memory. We thereby obtain significant end-to-end performance improvements up to 26x or reduced memory requirements.},
author = {Elgohary, Ahmed and Boehm, Matthias and Haas, Peter J. and Reiss, Frederick R. and Reinwald, Berthold},
doi = {10.14778/2994509.2994515},
journal = {Proceedings of the VLDB Endowment},
number = {12},
pages = {960--971},
publisher = {Association for Computing Machinery},
title = {{Compressed linear algebra for large-scale machine learning}},
volume = {9},
year = {2016}
}
@inproceedings{Stonebraker,
abstract = {This paper presents the design of a read-optimized relational DBMS that contrasts sharply with most current systems, which are write-optimized. Among the many differences in its design are: storage of data by column rather than by row, careful coding and packing of objects into storage including main memory during query processing, storing an overlapping collection of column-oriented projections, rather than the current fare of tables and indexes, a non-traditional implementation of transactions which includes high availability and snapshot isolation for read-only transactions, and the extensive use of bitmap indexes to complement B-tree structures. We present preliminary performance data on a subset of TPC-H and show that the system we are building, C-Store, is substantially faster than popular commercial products. Hence, the architecture looks very encouraging.},
address = {Trondheim, Norway},
author = {Stonebraker, Mike and Abadi, Daniel J. and Batkin, Adam and Chen, Xuedong and Cherniack, Mitch and Ferreira, Miguel and Lau, Edmond and Lin, Amerson and Madden, Sam and O'Neil, Elizabeth and O'Neil, Pat and Rasin, Alex and Tran, Nga and Zdonik, Stan},
booktitle = {Proceedings of the 31st International Conference on Very Large Data Bases},
file = {:home/moritz/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Stonebraker et al. - Unknown - C-Store A Column-oriented DBMS(2).pdf:pdf},
pages = {553--564},
publisher = {ACM},
title = {{C-Store: A Column-oriented DBMS}},
year = {2005}
}