Last active
December 20, 2015 06:39
-
-
Save kaspersorensen/6087230 to your computer and use it in GitHub Desktop.
Proposed improvement to MetaModel CSV performance, specifically for CSV files with only single-line values.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| From 72d0608b2ba1dd3878bcef3eb51e6369c4ccdaf3 Fri, 26 Jul 2013 10:45:14 +0200 | |
| From: kaspers <kaspers@kaspers-think.humaninference.com> | |
| Date: Fri, 26 Jul 2013 10:44:29 +0200 | |
| Subject: [PATCH] Performance improvement to CSV reading when values are only single line based. | |
| diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java b/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java | |
| index e2cd846..1662eab 100644 | |
| --- a/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java | |
| +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java | |
| @@ -31,130 +31,148 @@ | |
| */ | |
| public final class CsvConfiguration extends BaseObject implements Serializable { | |
| - private static final long serialVersionUID = 1L; | |
| + private static final long serialVersionUID = 1L; | |
| - /** | |
| - * The value is '\\uFFFF', the "not a character" value which should not | |
| - * occur in any valid Unicode string. This special char can be used to | |
| - * disable either quote chars or escape chars. | |
| - */ | |
| - public static final char NOT_A_CHAR = '\uFFFF'; | |
| - public static final int NO_COLUMN_NAME_LINE = 0; | |
| - public static final int DEFAULT_COLUMN_NAME_LINE = 1; | |
| - public static final char DEFAULT_SEPARATOR_CHAR = ','; | |
| - public static final char DEFAULT_QUOTE_CHAR = '"'; | |
| - public static final char DEFAULT_ESCAPE_CHAR = '\\'; | |
| + /** | |
| + * The value is '\\uFFFF', the "not a character" value which should not | |
| + * occur in any valid Unicode string. This special char can be used to | |
| + * disable either quote chars or escape chars. | |
| + */ | |
| + public static final char NOT_A_CHAR = '\uFFFF'; | |
| + public static final int NO_COLUMN_NAME_LINE = 0; | |
| + public static final int DEFAULT_COLUMN_NAME_LINE = 1; | |
| + public static final char DEFAULT_SEPARATOR_CHAR = ','; | |
| + public static final char DEFAULT_QUOTE_CHAR = '"'; | |
| + public static final char DEFAULT_ESCAPE_CHAR = '\\'; | |
| - private final int columnNameLineNumber; | |
| - private final String encoding; | |
| - private final char separatorChar; | |
| - private final char quoteChar; | |
| - private final char escapeChar; | |
| - private final boolean failOnInconsistentRowLength; | |
| + private final int columnNameLineNumber; | |
| + private final String encoding; | |
| + private final char separatorChar; | |
| + private final char quoteChar; | |
| + private final char escapeChar; | |
| + private final boolean failOnInconsistentRowLength; | |
| + private final boolean multilineValues; | |
| - public CsvConfiguration() { | |
| - this(DEFAULT_COLUMN_NAME_LINE); | |
| - } | |
| + public CsvConfiguration() { | |
| + this(DEFAULT_COLUMN_NAME_LINE); | |
| + } | |
| - public CsvConfiguration(int columnNameLineNumber) { | |
| - this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING, | |
| - DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR, DEFAULT_ESCAPE_CHAR); | |
| - } | |
| + public CsvConfiguration(int columnNameLineNumber) { | |
| + this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING, DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR, | |
| + DEFAULT_ESCAPE_CHAR); | |
| + } | |
| - public CsvConfiguration(int columnNameLineNumber, String encoding, | |
| - char separatorChar, char quoteChar, char escapeChar) { | |
| - this(columnNameLineNumber, encoding, separatorChar, quoteChar, | |
| - escapeChar, false); | |
| - } | |
| + public CsvConfiguration(int columnNameLineNumber, boolean failOnInconsistentRowLength, boolean multilineValues) { | |
| + this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING, DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR, | |
| + DEFAULT_ESCAPE_CHAR, failOnInconsistentRowLength, multilineValues); | |
| + } | |
| - public CsvConfiguration(int columnNameLineNumber, String encoding, | |
| - char separatorChar, char quoteChar, char escapeChar, | |
| - boolean failOnInconsistentRowLength) { | |
| - this.columnNameLineNumber = columnNameLineNumber; | |
| - this.encoding = encoding; | |
| - this.separatorChar = separatorChar; | |
| - this.quoteChar = quoteChar; | |
| - this.escapeChar = escapeChar; | |
| - this.failOnInconsistentRowLength = failOnInconsistentRowLength; | |
| - } | |
| + public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar, | |
| + char escapeChar) { | |
| + this(columnNameLineNumber, encoding, separatorChar, quoteChar, escapeChar, false); | |
| + } | |
| - /** | |
| - * Determines whether to fail (by throwing an | |
| - * {@link InconsistentRowLengthException}) if a line in the CSV file has | |
| - * inconsistent amounts of columns. | |
| - * | |
| - * If set to false (default) MetaModel will gracefully fill in missing null | |
| - * values in or ignore additional values in a line. | |
| - * | |
| - * @return a boolean indicating whether to fail or gracefully compensate for | |
| - * inconsistent lines in the CSV files. | |
| - */ | |
| - public boolean isFailOnInconsistentRowLength() { | |
| - return failOnInconsistentRowLength; | |
| - } | |
| + public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar, | |
| + char escapeChar, boolean failOnInconsistentRowLength) { | |
| + this(columnNameLineNumber, encoding, separatorChar, quoteChar, escapeChar, failOnInconsistentRowLength, true); | |
| + } | |
| - /** | |
| - * The line number (1 based) from which to get the names of the columns. | |
| - * | |
| - * @return the line number (1 based) | |
| - */ | |
| - public int getColumnNameLineNumber() { | |
| - return columnNameLineNumber; | |
| - } | |
| + public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar, | |
| + char escapeChar, boolean failOnInconsistentRowLength, boolean multilineValues) { | |
| + this.columnNameLineNumber = columnNameLineNumber; | |
| + this.encoding = encoding; | |
| + this.separatorChar = separatorChar; | |
| + this.quoteChar = quoteChar; | |
| + this.escapeChar = escapeChar; | |
| + this.failOnInconsistentRowLength = failOnInconsistentRowLength; | |
| + this.multilineValues = multilineValues; | |
| + } | |
| - /** | |
| - * Gets the file encoding to use for reading the file. | |
| - * | |
| - * @return the text encoding of the file. | |
| - */ | |
| - public String getEncoding() { | |
| - return encoding; | |
| - } | |
| + /** | |
| + * Determines whether to fail (by throwing an | |
| + * {@link InconsistentRowLengthException}) if a line in the CSV file has | |
| + * inconsistent amounts of columns. | |
| + * | |
| + * If set to false (default) MetaModel will gracefully fill in null for | |
| + * missing values, or ignore additional values, in a line. | |
| + * | |
| + * @return a boolean indicating whether to fail or gracefully compensate for | |
| + * inconsistent lines in the CSV files. | |
| + */ | |
| + public boolean isFailOnInconsistentRowLength() { | |
| + return failOnInconsistentRowLength; | |
| + } | |
| - /** | |
| - * Gets the separator char (typically comma or semicolon) for separating | |
| - * values. | |
| - * | |
| - * @return the separator char | |
| - */ | |
| - public char getSeparatorChar() { | |
| - return separatorChar; | |
| - } | |
| + /** | |
| + * Determines whether the CSV files read using this configuration should be | |
| + * allowed to have multiline values in them. | |
| + * | |
| + * @return true if multiline values are allowed, false if every value is on a single line | |
| + */ | |
| + public boolean isMultilineValues() { | |
| + return multilineValues; | |
| + } | |
| - /** | |
| - * Gets the quote char, used for encapsulating values. | |
| - * | |
| - * @return the quote char | |
| - */ | |
| - public char getQuoteChar() { | |
| - return quoteChar; | |
| - } | |
| + /** | |
| + * The line number (1 based) from which to get the names of the columns. | |
| + * | |
| + * @return the line number (1 based) | |
| + */ | |
| + public int getColumnNameLineNumber() { | |
| + return columnNameLineNumber; | |
| + } | |
| - /** | |
| - * Gets the escape char, used for escaping eg. quote chars inside values. | |
| - * | |
| - * @return the escape char | |
| - */ | |
| - public char getEscapeChar() { | |
| - return escapeChar; | |
| - } | |
| + /** | |
| + * Gets the file encoding to use for reading the file. | |
| + * | |
| + * @return the text encoding of the file. | |
| + */ | |
| + public String getEncoding() { | |
| + return encoding; | |
| + } | |
| - @Override | |
| - protected void decorateIdentity(List<Object> identifiers) { | |
| - identifiers.add(columnNameLineNumber); | |
| - identifiers.add(encoding); | |
| - identifiers.add(separatorChar); | |
| - identifiers.add(quoteChar); | |
| - identifiers.add(escapeChar); | |
| - identifiers.add(failOnInconsistentRowLength); | |
| - } | |
| + /** | |
| + * Gets the separator char (typically comma or semicolon) for separating | |
| + * values. | |
| + * | |
| + * @return the separator char | |
| + */ | |
| + public char getSeparatorChar() { | |
| + return separatorChar; | |
| + } | |
| - @Override | |
| - public String toString() { | |
| - return "CsvConfiguration[columnNameLineNumber=" + columnNameLineNumber | |
| - + ", encoding=" + encoding + ", separatorChar=" + separatorChar | |
| - + ", quoteChar=" + quoteChar + ", escapeChar=" + escapeChar | |
| - + ", failOnInconsistentRowLength=" | |
| - + failOnInconsistentRowLength + "]"; | |
| - } | |
| + /** | |
| + * Gets the quote char, used for encapsulating values. | |
| + * | |
| + * @return the quote char | |
| + */ | |
| + public char getQuoteChar() { | |
| + return quoteChar; | |
| + } | |
| + | |
| + /** | |
| + * Gets the escape char, used for escaping e.g. quote chars inside values. | |
| + * | |
| + * @return the escape char | |
| + */ | |
| + public char getEscapeChar() { | |
| + return escapeChar; | |
| + } | |
| + | |
| + /** | |
| + * Adds every configuration field to the identity list. | |
| + * NOTE(review): BaseObject presumably derives equals/hashCode from these identifiers - confirm. | |
| + * | |
| + * @param identifiers the list to append this object's identifying values to | |
| + */ | |
| + @Override | |
| + protected void decorateIdentity(List<Object> identifiers) { | |
| + identifiers.add(columnNameLineNumber); | |
| + identifiers.add(encoding); | |
| + identifiers.add(separatorChar); | |
| + identifiers.add(quoteChar); | |
| + identifiers.add(escapeChar); | |
| + identifiers.add(failOnInconsistentRowLength); | |
| + // configurations that differ only in multiline handling select different readers, so they must not share an identity | |
| + identifiers.add(multilineValues); | |
| + } | |
| + | |
| + /** | |
| + * @return a human readable representation listing all configuration fields | |
| + */ | |
| + @Override | |
| + public String toString() { | |
| + return "CsvConfiguration[columnNameLineNumber=" + columnNameLineNumber + ", encoding=" + encoding | |
| + + ", separatorChar=" + separatorChar + ", quoteChar=" + quoteChar + ", escapeChar=" + escapeChar | |
| + + ", failOnInconsistentRowLength=" + failOnInconsistentRowLength + ", multilineValues=" | |
| + + multilineValues + "]"; | |
| + } | |
| } | |
| diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java | |
| index 2b275a9..5effa2c 100644 | |
| --- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java | |
| +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java | |
| @@ -35,6 +35,7 @@ | |
| import org.apache.metamodel.UpdateScript; | |
| import org.apache.metamodel.UpdateableDataContext; | |
| import org.apache.metamodel.data.DataSet; | |
| +import org.apache.metamodel.data.EmptyDataSet; | |
| import org.apache.metamodel.query.FilterItem; | |
| import org.apache.metamodel.schema.Column; | |
| import org.apache.metamodel.schema.Table; | |
| @@ -46,6 +47,7 @@ | |
| import org.slf4j.Logger; | |
| import org.slf4j.LoggerFactory; | |
| +import au.com.bytecode.opencsv.CSVParser; | |
| import au.com.bytecode.opencsv.CSVReader; | |
| /** | |
| @@ -336,23 +338,50 @@ | |
| @Override | |
| public DataSet materializeMainSchemaTable(Table table, Column[] columns, int maxRows) { | |
| final int lineNumber = _configuration.getColumnNameLineNumber(); | |
| - final CSVReader reader = createCsvReader(lineNumber); | |
| final int columnCount = table.getColumnCount(); | |
| - final boolean failOnInconsistentRowLength = _configuration.isFailOnInconsistentRowLength(); | |
| - if (maxRows < 0) { | |
| - return new CsvDataSet(reader, columns, null, columnCount, failOnInconsistentRowLength); | |
| - } else { | |
| - return new CsvDataSet(reader, columns, maxRows, columnCount, failOnInconsistentRowLength); | |
| + | |
| + final BufferedReader reader = FileHelper.getBufferedReader(_resource.read(), _configuration.getEncoding()); | |
| + | |
| + try { | |
| + // skip column header lines | |
| + for (int i = 0; i < lineNumber; i++) { | |
| + String line = reader.readLine(); | |
| + if (line == null) { | |
| + // the file ended before the data lines; the reader is not handed to any DataSet, so close it here | |
| + FileHelper.safeClose(reader); | |
| + return new EmptyDataSet(columns); | |
| + } | |
| + } | |
| + } catch (IOException e) { | |
| + // the reader is not handed to any DataSet on this path either, so close it before propagating | |
| + FileHelper.safeClose(reader); | |
| + throw new MetaModelException("IOException occurred while reading from CSV resource: " + _resource, e); | |
| } | |
| + | |
| + final boolean failOnInconsistentRowLength = _configuration.isFailOnInconsistentRowLength(); | |
| + | |
| + // NOTE(review): maxRows == 0 now maps to null (no limit); the removed code passed 0 through - confirm intended | |
| + final Integer maxRowsOrNull = (maxRows > 0 ? maxRows : null); | |
| + | |
| + if (_configuration.isMultilineValues()) { | |
| + // multiline values need the full CSVReader, which can span several physical lines per record | |
| + final CSVReader csvReader = createCsvReader(reader); | |
| + return new CsvDataSet(csvReader, columns, maxRowsOrNull, columnCount, failOnInconsistentRowLength); | |
| + } | |
| + | |
| + // single-line mode: each physical line is one record, parsed on demand by the row itself | |
| + final CSVParser csvParser = new CSVParser(_configuration.getSeparatorChar(), _configuration.getQuoteChar(), | |
| + _configuration.getEscapeChar()); | |
| + return new SingleLineCsvDataSet(reader, csvParser, columns, maxRowsOrNull, columnCount, | |
| + failOnInconsistentRowLength); | |
| } | |
| protected CSVReader createCsvReader(int skipLines) { | |
| - final Reader fileReader = FileHelper.getReader(_resource.read(), _configuration.getEncoding()); | |
| - final CSVReader csvReader = new CSVReader(fileReader, _configuration.getSeparatorChar(), | |
| + final Reader reader = FileHelper.getReader(_resource.read(), _configuration.getEncoding()); | |
| + final CSVReader csvReader = new CSVReader(reader, _configuration.getSeparatorChar(), | |
| _configuration.getQuoteChar(), _configuration.getEscapeChar(), skipLines); | |
| return csvReader; | |
| } | |
| + protected CSVReader createCsvReader(BufferedReader reader) { | |
| + final CSVReader csvReader = new CSVReader(reader, _configuration.getSeparatorChar(), | |
| + _configuration.getQuoteChar(), _configuration.getEscapeChar()); | |
| + return csvReader; | |
| + } | |
| + | |
| @Override | |
| protected CsvSchema getMainSchema() throws MetaModelException { | |
| CsvSchema schema = new CsvSchema(getMainSchemaName(), this); | |
| diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java | |
| index b69ab13..4bfb18c 100644 | |
| --- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java | |
| +++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java | |
| @@ -83,7 +83,7 @@ | |
| return false; | |
| } | |
| } | |
| - | |
| + | |
| private boolean nextInternal() { | |
| if (_reader == null) { | |
| return false; | |
| diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java | |
| new file mode 100644 | |
| index 0000000..2d8a800 | |
| --- /dev/null | |
| +++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java | |
| @@ -0,0 +1,124 @@ | |
| +/** | |
| + * Licensed to the Apache Software Foundation (ASF) under one | |
| + * or more contributor license agreements. See the NOTICE file | |
| + * distributed with this work for additional information | |
| + * regarding copyright ownership. The ASF licenses this file | |
| + * to you under the Apache License, Version 2.0 (the | |
| + * "License"); you may not use this file except in compliance | |
| + * with the License. You may obtain a copy of the License at | |
| + * | |
| + * http://www.apache.org/licenses/LICENSE-2.0 | |
| + * | |
| + * Unless required by applicable law or agreed to in writing, | |
| + * software distributed under the License is distributed on an | |
| + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
| + * KIND, either express or implied. See the License for the | |
| + * specific language governing permissions and limitations | |
| + * under the License. | |
| + */ | |
| +package org.apache.metamodel.csv; | |
| + | |
| +import java.io.BufferedReader; | |
| +import java.io.IOException; | |
| + | |
| +import org.apache.metamodel.MetaModelException; | |
| +import org.apache.metamodel.data.AbstractDataSet; | |
| +import org.apache.metamodel.data.DataSetHeader; | |
| +import org.apache.metamodel.data.Row; | |
| +import org.apache.metamodel.schema.Column; | |
| +import org.apache.metamodel.util.FileHelper; | |
| + | |
| +import au.com.bytecode.opencsv.CSVParser; | |
| + | |
| +/** | |
| + * A specialized DataSet implementation for the CSV module under circumstances | |
| + * where multiline values are disabled. In this case we can use an optimized | |
| + * CSVParser and also lazily evaluate lines read from the file. | |
| + */ | |
| +final class SingleLineCsvDataSet extends AbstractDataSet { | |
| + | |
| + private final BufferedReader _reader; | |
| + private final CSVParser _csvParser; | |
| + private final int _columnsInTable; | |
| + private final boolean _failOnInconsistentRowLength; | |
| + | |
| + private volatile int _rowNumber; | |
| + private volatile Integer _rowsRemaining; | |
| + private volatile Row _row; | |
| + | |
| + public SingleLineCsvDataSet(BufferedReader reader, CSVParser csvParser, Column[] columns, Integer maxRows, | |
| + int columnsInTable, boolean failOnInconsistentRowLength) { | |
| + super(columns); | |
| + _reader = reader; | |
| + _csvParser = csvParser; | |
| + _columnsInTable = columnsInTable; | |
| + _failOnInconsistentRowLength = failOnInconsistentRowLength; | |
| + _rowNumber = 0; | |
| + _rowsRemaining = maxRows; | |
| + } | |
| + | |
| + /** | |
| + * Closes the underlying reader and discards the current row. After this | |
| + * call {@link #next()} returns false. | |
| + */ | |
| + @Override | |
| + public void close() { | |
| + FileHelper.safeClose(_reader); | |
| + _row = null; | |
| + // 0 means "no rows left"; null would mean "unlimited" and make next() read from the closed reader | |
| + _rowsRemaining = 0; | |
| + } | |
| + | |
| + @Override | |
| + public boolean next() { | |
| + if (_rowsRemaining != null && _rowsRemaining > 0) { | |
| + _rowsRemaining--; | |
| + return nextInternal(); | |
| + } else if (_rowsRemaining == null) { | |
| + return nextInternal(); | |
| + } else { | |
| + return false; | |
| + } | |
| + } | |
| + | |
| + @Override | |
| + protected DataSetHeader getHeader() { | |
| + // re-make this method protected so that it's visible for | |
| + // SingleLineCsvRow. | |
| + return super.getHeader(); | |
| + } | |
| + | |
| + protected boolean isFailOnInconsistentRowLength() { | |
| + return _failOnInconsistentRowLength; | |
| + } | |
| + | |
| + protected int getColumnsInTable() { | |
| + return _columnsInTable; | |
| + } | |
| + | |
| + protected CSVParser getCsvParser() { | |
| + return _csvParser; | |
| + } | |
| + | |
| + public boolean nextInternal() { | |
| + if (_reader == null) { | |
| + return false; | |
| + } | |
| + | |
| + try { | |
| + final String line = _reader.readLine(); | |
| + if (line == null) { | |
| + close(); | |
| + return false; | |
| + } | |
| + | |
| + _rowNumber++; | |
| + _row = new SingleLineCsvRow(this, line, _columnsInTable, _failOnInconsistentRowLength, _rowNumber); | |
| + return true; | |
| + } catch (IOException e) { | |
| + close(); | |
| + throw new MetaModelException("IOException occurred while reading next line of CSV resource", e); | |
| + } | |
| + } | |
| + | |
| + @Override | |
| + public Row getRow() { | |
| + return _row; | |
| + } | |
| + | |
| +} | |
| diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java | |
| new file mode 100644 | |
| index 0000000..ee93974 | |
| --- /dev/null | |
| +++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java | |
| @@ -0,0 +1,57 @@ | |
| +package org.apache.metamodel.csv; | |
| + | |
| +import org.apache.metamodel.data.AbstractRow; | |
| +import org.apache.metamodel.data.DataSetHeader; | |
| +import org.apache.metamodel.data.Style; | |
| +import org.apache.metamodel.util.LazyRef; | |
| + | |
| +import au.com.bytecode.opencsv.CSVParser; | |
| + | |
| +/** | |
| + * Specialized row implementation for single-line CSV values | |
| + */ | |
| +final class SingleLineCsvRow extends AbstractRow { | |
| + | |
| + private static final long serialVersionUID = 1L; | |
| + | |
| + private final SingleLineCsvDataSet _dataSet; | |
| + private final LazyRef<String[]> _valuesRef; | |
| + | |
| + public SingleLineCsvRow(SingleLineCsvDataSet dataSet, final String line, final int columnsInTable, | |
| + final boolean failOnInconsistentRowLength, final int rowNumber) { | |
| + _dataSet = dataSet; | |
| + _valuesRef = new LazyRef<String[]>() { | |
| + @Override | |
| + protected String[] fetch() throws Throwable { | |
| + final CSVParser parser = _dataSet.getCsvParser(); | |
| + final String[] values = parser.parseLine(line); | |
| + | |
| + if (failOnInconsistentRowLength) { | |
| + if (columnsInTable != values.length) { | |
| + throw new InconsistentRowLengthException(columnsInTable, SingleLineCsvRow.this, values, | |
| + rowNumber); | |
| + } | |
| + } | |
| + | |
| + return values; | |
| + } | |
| + }; | |
| + } | |
| + | |
| + @Override | |
| + public Object getValue(int index) throws IndexOutOfBoundsException { | |
| + String[] values = _valuesRef.get(); | |
| + return values[index]; | |
| + } | |
| + | |
| + @Override | |
| + public Style getStyle(int index) throws IndexOutOfBoundsException { | |
| + return Style.NO_STYLE; | |
| + } | |
| + | |
| + @Override | |
| + protected DataSetHeader getHeader() { | |
| + return _dataSet.getHeader(); | |
| + } | |
| + | |
| +} | |
| diff --git a/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java b/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java | |
| index 54c6987..c7243bd 100644 | |
| --- a/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java | |
| +++ b/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java | |
| @@ -19,85 +19,114 @@ | |
| package org.apache.metamodel.csv; | |
| import java.io.File; | |
| +import java.util.concurrent.CountDownLatch; | |
| +import java.util.concurrent.ExecutorService; | |
| +import java.util.concurrent.Executors; | |
| +import java.util.concurrent.atomic.AtomicBoolean; | |
| + | |
| +import junit.framework.TestCase; | |
| import org.apache.metamodel.DataContext; | |
| -import org.apache.metamodel.csv.CsvConfiguration; | |
| -import org.apache.metamodel.csv.CsvDataContext; | |
| import org.apache.metamodel.data.DataSet; | |
| +import org.apache.metamodel.data.Row; | |
| import org.apache.metamodel.query.Query; | |
| import org.apache.metamodel.query.SelectItem; | |
| import org.apache.metamodel.schema.Table; | |
| -import junit.framework.TestCase; | |
| - | |
| public class CsvBigFileMemoryTest extends TestCase { | |
| - private final int hugeFileRows = 3000; | |
| - private final int hugeFileCols = 2000; | |
| + private final int hugeFileRows = 30000; | |
| + private final int hugeFileCols = 2000; | |
| - private File getHugeFile() { | |
| - final File file = new File("target/huge_csv.csv"); | |
| - if (!file.exists()) { | |
| + private File getHugeFile() { | |
| + final File file = new File("target/huge_csv.csv"); | |
| + if (!file.exists()) { | |
| - final ExampleDataGenerator exampleDataGenerator = new ExampleDataGenerator( | |
| - hugeFileRows, hugeFileCols); | |
| - exampleDataGenerator.createFile(file); | |
| - } | |
| - return file; | |
| - } | |
| + final ExampleDataGenerator exampleDataGenerator = new ExampleDataGenerator(hugeFileRows, hugeFileCols); | |
| + exampleDataGenerator.createFile(file); | |
| + } | |
| + return file; | |
| + } | |
| - /** | |
| - * Runs a performance test based on the data created by the | |
| - * ExampleDataCreator utility. | |
| - * | |
| - * @see ExampleDataGenerator | |
| - * @throws Exception | |
| - */ | |
| - public void testHugeFile() throws Exception { | |
| - final File file = getHugeFile(); | |
| + /** | |
| + * Runs a performance test based on the data created by the | |
| + * ExampleDataGenerator utility. | |
| + * | |
| + * @see ExampleDataGenerator | |
| + * @throws Exception | |
| + */ | |
| + public void testHugeFile() throws Exception { | |
| + final File file = getHugeFile(); | |
| - final long timeAtStart = System.currentTimeMillis(); | |
| - System.out.println("time at start: " + timeAtStart); | |
| + final long timeAtStart = System.currentTimeMillis(); | |
| + System.out.println("time at start: " + timeAtStart); | |
| - final DataContext dc = new CsvDataContext(file, new CsvConfiguration()); | |
| - final Table t = dc.getDefaultSchema().getTables()[0]; | |
| + final DataContext dc = new CsvDataContext(file, new CsvConfiguration(1, false, false)); | |
| + final Table t = dc.getDefaultSchema().getTables()[0]; | |
| - final long timeAfterDataContext = System.currentTimeMillis(); | |
| - System.out.println("time after DataContext: " + timeAfterDataContext); | |
| + final long timeAfterDataContext = System.currentTimeMillis(); | |
| + System.out.println("time after DataContext: " + timeAfterDataContext); | |
| - final Query q = new Query().select(t.getColumns()).from(t); | |
| - DataSet ds = dc.executeQuery(q); | |
| + final Query q = new Query().select(t.getColumns()).from(t); | |
| + DataSet ds = dc.executeQuery(q); | |
| - long timeAfterQuery = System.currentTimeMillis(); | |
| - System.out.println("time after query: " + timeAfterQuery); | |
| + long timeAfterQuery = System.currentTimeMillis(); | |
| + System.out.println("time after query: " + timeAfterQuery); | |
| - while (ds.next()) { | |
| - assertEquals(hugeFileCols, ds.getRow().getValues().length); | |
| - } | |
| - ds.close(); | |
| + final CountDownLatch countDown = new CountDownLatch(hugeFileRows); | |
| + final AtomicBoolean success = new AtomicBoolean(true); | |
| - long timeAfterDataSet = System.currentTimeMillis(); | |
| - System.out.println("time after dataSet: " + timeAfterDataSet); | |
| + ExecutorService executorService = Executors.newFixedThreadPool(30); | |
| - if (!file.delete()) { | |
| - file.deleteOnExit(); | |
| - } | |
| - } | |
| + while (ds.next()) { | |
| + final Row row = ds.getRow(); | |
| + executorService.submit(new Runnable() { | |
| + @Override | |
| + public void run() { | |
| + if (hugeFileCols != row.getValues().length) { | |
| + System.out.println("Weird row: " + row); | |
| + success.set(false); | |
| + } | |
| + countDown.countDown(); | |
| + } | |
| + }); | |
| + } | |
| + ds.close(); | |
| - public void testApproximatedCountHugeFile() throws Exception { | |
| - DataContext dc = new CsvDataContext(getHugeFile()); | |
| + countDown.await(); | |
| + assertTrue(success.get()); | |
| - Table table = dc.getDefaultSchema().getTables()[0]; | |
| - Query q = dc.query().from(table).selectCount().toQuery(); | |
| - SelectItem selectItem = q.getSelectClause().getItem(0); | |
| - selectItem.setFunctionApproximationAllowed(true); | |
| + executorService.shutdown(); | |
| - DataSet ds = dc.executeQuery(q); | |
| - assertTrue(ds.next()); | |
| - Object[] values = ds.getRow().getValues(); | |
| - assertEquals(1, values.length); | |
| - assertEquals(3332, ((Long) ds.getRow().getValue(selectItem)).intValue()); | |
| - assertEquals(3332, ((Long) values[0]).intValue()); | |
| - assertFalse(ds.next()); | |
| - } | |
| + long timeAfterDataSet = System.currentTimeMillis(); | |
| + System.out.println("time after dataSet: " + timeAfterDataSet); | |
| + | |
| + long totalTime = timeAfterDataSet - timeAfterDataContext; | |
| + System.out.println("Total time to process large file: " + totalTime + " millis"); | |
| + | |
| + // results with old impl: [13908, 13827, 14577] | |
| + | |
| + // results with new impl: [8567, 8965, 8154] | |
| + | |
| + if (!file.delete()) { | |
| + file.deleteOnExit(); | |
| + } | |
| + } | |
| + | |
| + public void testApproximatedCountHugeFile() throws Exception { | |
| + DataContext dc = new CsvDataContext(getHugeFile()); | |
| + | |
| + Table table = dc.getDefaultSchema().getTables()[0]; | |
| + Query q = dc.query().from(table).selectCount().toQuery(); | |
| + SelectItem selectItem = q.getSelectClause().getItem(0); | |
| + selectItem.setFunctionApproximationAllowed(true); | |
| + | |
| + DataSet ds = dc.executeQuery(q); | |
| + assertTrue(ds.next()); | |
| + Object[] values = ds.getRow().getValues(); | |
| + assertEquals(1, values.length); | |
| + assertEquals(3332, ((Long) ds.getRow().getValue(selectItem)).intValue()); | |
| + assertEquals(3332, ((Long) values[0]).intValue()); | |
| + assertFalse(ds.next()); | |
| + } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment