Skip to content
Snippets Groups Projects
Commit 51baca56 authored by Andreas Tille
Browse files

New upstream version 0.2.4

parent 9f6ea8d9
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,8 @@ External-Memory Sorting in Java: useful to sort very large files using multiple
The versions 0.1 of the library are compatible with Java 6 and above. Versions 0.2 and above
require at least Java 8.
This code is used in [Apache Jackrabbit Oak](https://github.com/apache/jackrabbit-oak).
Code sample
------------
......
......@@ -3,7 +3,7 @@
<groupId>com.google.code.externalsortinginjava</groupId>
<artifactId>externalsortinginjava</artifactId>
<packaging>jar</packaging>
<version>0.2.3</version>
<version>0.2.4</version>
<name>externalsortinginjava</name>
<url>http://github.com/lemire/externalsortinginjava/</url>
<description>Sometimes, you want to sort large file without first loading them into memory. The solution is to use External Sorting. You divide the files into small blocks, sort each block in RAM, and then merge the result.
......@@ -198,6 +198,6 @@
<connection>scm:git:git@github.com:lemire/externalsortinginjava.git</connection>
<url>scm:git:git@github.com:lemire/externalsortinginjava.git</url>
<developerConnection>scm:git:git@github.com:lemire/externalsortinginjava.git</developerConnection>
<tag>externalsortinginjava-0.2.3</tag>
<tag>externalsortinginjava-0.2.4</tag>
</scm>
</project>
......@@ -210,7 +210,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
*
*/
public static int mergeSortedFiles(BufferedWriter fbw,
public static long mergeSortedFiles(BufferedWriter fbw,
final Comparator<String> cmp, boolean distinct,
List<BinaryFileBuffer> buffers) throws IOException {
PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<>(
......@@ -226,7 +226,7 @@ public class ExternalSort {
pq.add(bfb);
}
}
int rowcounter = 0;
long rowcounter = 0;
try {
if (!distinct) {
while (pq.size() > 0) {
......@@ -290,7 +290,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile)
public static long mergeSortedFiles(List<File> files, File outputfile)
throws IOException {
return mergeSortedFiles(files, outputfile, defaultcomparator,
Charset.defaultCharset());
......@@ -306,7 +306,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp) throws IOException {
return mergeSortedFiles(files, outputfile, cmp,
Charset.defaultCharset());
......@@ -324,7 +324,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, boolean distinct)
throws IOException {
return mergeSortedFiles(files, outputfile, cmp,
......@@ -343,7 +343,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs) throws IOException {
return mergeSortedFiles(files, outputfile, cmp, cs, false);
}
......@@ -363,7 +363,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
* @since v0.1.2
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs, boolean distinct)
throws IOException {
return mergeSortedFiles(files, outputfile, cmp, cs, distinct,
......@@ -389,7 +389,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
* @since v0.1.4
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs, boolean distinct,
boolean append, boolean usegzip) throws IOException {
ArrayList<BinaryFileBuffer> bfbs = new ArrayList<>();
......@@ -412,7 +412,7 @@ public class ExternalSort {
}
BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(outputfile, append), cs));
int rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
long rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
for (File f : files) {
f.delete();
}
......
......@@ -13,13 +13,21 @@ import java.io.FileReader;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.github.jamm.*;
......@@ -407,4 +415,47 @@ public class ExternalSortTest {
}
}
/**
 * Sorts a text file whose line count exceeds {@link Integer#MAX_VALUE} and
 * verifies that the merge step reports the complete count as a {@code long}.
 *
 * <p>The generated input file and the merged output file are very large, so
 * both are removed once the test finishes, whether it passes or fails.
 *
 * @throws IOException if the input file cannot be generated or the sort/merge fails
 */
@Ignore("This test takes too long to execute")
@Test
public void sortVeryLargeFile() throws IOException {
    final Path veryLargeFile = getTestFile();
    Path outputFile = null;
    try {
        outputFile = Files.createTempFile("Merged-File", ".tmp");
        final long sortedLines = ExternalSort.mergeSortedFiles(
                ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
        // 2148 chunks of one million lines each: just above Integer.MAX_VALUE.
        final long expectedLines = 2148L * 1000000L;
        assertEquals(expectedLines, sortedLines);
    } finally {
        // Best-effort cleanup: File.delete() does not throw, so a failed delete
        // cannot mask an assertion failure or an IOException from the try block.
        veryLargeFile.toFile().delete();
        if (outputFile != null) {
            outputFile.toFile().delete();
        }
    }
}
/**
 * Generates a temporary test file with 2148 million lines, each consisting of
 * the single character {@code A}.
 *
 * <p>One chunk of one million newline-separated lines is built in memory and
 * written 2148 times, with a newline between consecutive chunks. The final
 * line therefore has no trailing newline.
 *
 * @return the path of the generated file; this method does not delete it
 * @throws IOException if the temporary file cannot be created or written
 */
private Path getTestFile() throws IOException {
    System.out.println("Temp File Creation: Started");
    final Path path = Files.createTempFile("IntegrationTestFile", ".txt");
    final int linesPerChunk = 1000000;
    final int chunkCount = 2148;
    final String content = IntStream.range(0, linesPerChunk)
            .mapToObj(i -> "A")
            .collect(Collectors.joining("\n"));
    // Encode once up front; the same bytes are reused for every append.
    final byte[] firstChunk = content.getBytes(StandardCharsets.UTF_8);
    final byte[] followingChunk = ("\n" + content).getBytes(StandardCharsets.UTF_8);
    Files.write(path, firstChunk, StandardOpenOption.TRUNCATE_EXISTING);
    // A plain loop lets an IOException propagate with its original type and
    // stack trace instead of being flattened into a cause-less RuntimeException.
    for (int chunk = 1; chunk < chunkCount; chunk++) {
        Files.write(path, followingChunk, StandardOpenOption.APPEND);
    }
    System.out.println("Temp File Creation: Finished");
    return path;
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment