Skip to content
Commits on Source (8)
......@@ -12,6 +12,8 @@ External-Memory Sorting in Java: useful to sort very large files using multiple
Version 0.1 of the library is compatible with Java 6 and above. Versions 0.2 and above
require at least Java 8.
This code is used in [Apache Jackrabbit Oak](https://github.com/apache/jackrabbit-oak).
Code sample
------------
......
libexternalsortinginjava-java (0.2.4-1) UNRELEASED; urgency=medium
* New upstream version
* Standards-Version: 4.1.4
* Point Vcs-fields to Salsa
* debhelper 11
TODO: build fails
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-javadoc-plugin:3.0.0:jar (default-cli) on project externalsortinginjava: Execution default-cli of goal org.apache.maven.plugins:maven-javadoc-plugin:3.0.0:jar failed.: NullPointerException -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
-- Andreas Tille <tille@debian.org> Wed, 02 May 2018 08:03:42 +0200
libexternalsortinginjava-java (0.2.3-1) unstable; urgency=medium
* Initial release (Closes: #861939)
......
......@@ -3,17 +3,17 @@ Maintainer: Debian Java Maintainers <pkg-java-maintainers@lists.alioth.debian.or
Uploaders: Andreas Tille <tille@debian.org>
Section: java
Priority: optional
Build-Depends: debhelper (>= 10),
Build-Depends: debhelper (>= 11~),
default-jdk,
maven-debian-helper (>= 2.1)
Build-Depends-Indep: libmaven-dependency-plugin-java,
junit4,
default-jdk-doc,
libmaven-javadoc-plugin-java
Standards-Version: 3.9.8
Vcs-Browser: https://anonscm.debian.org/cgit/pkg-java/libexternalsortinginjava-java.git
Vcs-Git: https://anonscm.debian.org/pkg-java/libexternalsortinginjava-java.git
Homepage: http://github.com/lemire/externalsortinginjava/
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/java-team/libexternalsortinginjava-java
Vcs-Git: https://salsa.debian.org/java-team/libexternalsortinginjava-java.git
Homepage: http://github.com/lemire/externalsortinginjava/
Package: libexternalsortinginjava-java
Architecture: all
......
......@@ -4,15 +4,15 @@ Description: Skip test requiring unpackaged class jamm
--- a/src/test/java/com/google/code/externalsorting/ExternalSortTest.java
+++ b/src/test/java/com/google/code/externalsorting/ExternalSortTest.java
@@ -21,7 +21,6 @@ import java.util.Scanner;
import org.junit.After;
@@ -29,7 +29,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
-import org.github.jamm.*;
/**
* Unit test for simple App.
@@ -131,25 +130,6 @@ public class ExternalSortTest {
@@ -139,25 +138,6 @@ public class ExternalSortTest {
System.out.println("[performance] String size estimator uses "+bestdiff * 1.0 / N + " ns per string");
}
......
......@@ -3,7 +3,7 @@
<groupId>com.google.code.externalsortinginjava</groupId>
<artifactId>externalsortinginjava</artifactId>
<packaging>jar</packaging>
<version>0.2.3</version>
<version>0.2.4</version>
<name>externalsortinginjava</name>
<url>http://github.com/lemire/externalsortinginjava/</url>
<description>Sometimes, you want to sort large files without first loading them into memory. The solution is to use External Sorting. You divide the files into small blocks, sort each block in RAM, and then merge the result.
......@@ -198,6 +198,6 @@
<connection>scm:git:git@github.com:lemire/externalsortinginjava.git</connection>
<url>scm:git:git@github.com:lemire/externalsortinginjava.git</url>
<developerConnection>scm:git:git@github.com:lemire/externalsortinginjava.git</developerConnection>
<tag>externalsortinginjava-0.2.3</tag>
<tag>externalsortinginjava-0.2.4</tag>
</scm>
</project>
......@@ -210,7 +210,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
*
*/
public static int mergeSortedFiles(BufferedWriter fbw,
public static long mergeSortedFiles(BufferedWriter fbw,
final Comparator<String> cmp, boolean distinct,
List<BinaryFileBuffer> buffers) throws IOException {
PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<>(
......@@ -226,7 +226,7 @@ public class ExternalSort {
pq.add(bfb);
}
}
int rowcounter = 0;
long rowcounter = 0;
try {
if (!distinct) {
while (pq.size() > 0) {
......@@ -290,7 +290,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile)
public static long mergeSortedFiles(List<File> files, File outputfile)
throws IOException {
return mergeSortedFiles(files, outputfile, defaultcomparator,
Charset.defaultCharset());
......@@ -306,7 +306,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp) throws IOException {
return mergeSortedFiles(files, outputfile, cmp,
Charset.defaultCharset());
......@@ -324,7 +324,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, boolean distinct)
throws IOException {
return mergeSortedFiles(files, outputfile, cmp,
......@@ -343,7 +343,7 @@ public class ExternalSort {
* @return The number of lines sorted.
* @throws IOException generic IO exception
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs) throws IOException {
return mergeSortedFiles(files, outputfile, cmp, cs, false);
}
......@@ -363,7 +363,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
* @since v0.1.2
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs, boolean distinct)
throws IOException {
return mergeSortedFiles(files, outputfile, cmp, cs, distinct,
......@@ -389,7 +389,7 @@ public class ExternalSort {
* @throws IOException generic IO exception
* @since v0.1.4
*/
public static int mergeSortedFiles(List<File> files, File outputfile,
public static long mergeSortedFiles(List<File> files, File outputfile,
final Comparator<String> cmp, Charset cs, boolean distinct,
boolean append, boolean usegzip) throws IOException {
ArrayList<BinaryFileBuffer> bfbs = new ArrayList<>();
......@@ -412,7 +412,7 @@ public class ExternalSort {
}
BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(outputfile, append), cs));
int rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
long rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
for (File f : files) {
f.delete();
}
......
......@@ -13,13 +13,21 @@ import java.io.FileReader;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.github.jamm.*;
......@@ -407,4 +415,47 @@ public class ExternalSortTest {
}
}
/**
 * Sorts a text file that contains more lines than {@link Integer#MAX_VALUE}
 * and checks that the line count returned by
 * {@code ExternalSort.mergeSortedFiles} is reported as a {@code long}
 * without overflowing.
 *
 * The generated input file and the merged output file are removed once the
 * test finishes, whether it passes or fails.
 *
 * @throws IOException if the test file cannot be created, sorted or merged
 */
@Ignore("This test takes too long to execute")
@Test
public void sortVeryLargeFile() throws IOException {
    // 2148 blocks of one million lines each: just above Integer.MAX_VALUE.
    final long expectedLines = 2148L * 1000000L;
    final Path veryLargeFile = getTestFile();
    try {
        final Path outputFile = Files.createTempFile("Merged-File", ".tmp");
        try {
            final long sortedLines = ExternalSort.mergeSortedFiles(
                    ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
            assertEquals(expectedLines, sortedLines);
        } finally {
            // Best-effort cleanup: File.delete() does not throw, so it can
            // never mask an assertion failure raised above.
            outputFile.toFile().delete();
        }
    } finally {
        veryLargeFile.toFile().delete();
    }
}
/**
 * Generate a test file with 2148 million lines.
 *
 * The file consists of 2148 blocks of one million lines, every line being
 * the single character {@code A}. Blocks are separated by a newline and the
 * file has no trailing newline.
 *
 * @return the path of the freshly created temporary file
 * @throws IOException if the temporary file cannot be created or written
 */
private Path getTestFile() throws IOException {
    System.out.println("Temp File Creation: Started");
    final Path path = Files.createTempFile("IntegrationTestFile", ".txt");
    final int linesPerBlock = 1000000;
    final int blockCount = 2148;
    try {
        // One block: linesPerBlock copies of "A" joined by '\n'.
        final StringBuilder block = new StringBuilder(2 * linesPerBlock);
        for (int i = 0; i < linesPerBlock; i++) {
            if (i > 0) {
                block.append('\n');
            }
            block.append('A');
        }
        final byte[] firstBlock = block.toString().getBytes(StandardCharsets.UTF_8);
        // Every following block carries its own leading separator so that a
        // single append per block is enough.
        final byte[] followingBlock = ("\n" + block).getBytes(StandardCharsets.UTF_8);

        Files.write(path, firstBlock, StandardOpenOption.TRUNCATE_EXISTING);
        // A plain loop lets IOException propagate unchanged instead of being
        // wrapped (and stripped of its cause) inside a lambda.
        for (int i = 1; i < blockCount; i++) {
            Files.write(path, followingBlock, StandardOpenOption.APPEND);
        }
    } catch (IOException | RuntimeException e) {
        // Best effort: do not leave a partially written multi-gigabyte file behind.
        path.toFile().delete();
        throw e;
    }
    System.out.println("Temp File Creation: Finished");
    return path;
}
}