Commit 704dade1 authored by Kunal Mehta's avatar Kunal Mehta

New upstream version 5.0.0

parent f6c69953
libzim 5.0.0
============
* Fix thread sleeping for win32 cross-compilation.
* Fix a potential invalid access when reading dirent.
* Fix memory leak in the decompression algorithm.
* [Writer] Fix a memory leak (cluster cleaning)
* [Writer] Write article data in a temporary cluster file instead of a
temporary file per article.
* [Writer] Better algorithm to store the dirent while creating the zim
file. Better memory usage.
* [Writer] [API Change] Url/Ns are now handled using the same `Url` class.
* [Writer] [API Change] No more aid and redirectAid. A redirect article
has to implement `getRedirectUrl`.
* [Writer] Use a memory pool to avoid multiple small memory allocations.
* [Writer] [API Change] Rename `ZimCreator` to `Creator`.
* [API Change] File's `search` and `suggestions` now return a unique_ptr
instead of a raw pointer.
libzim 4.0.7
============
......
......@@ -20,7 +20,7 @@
#include <iostream>
#include <sstream>
#include <vector>
#include <zim/writer/zimcreator.h>
#include <zim/writer/creator.h>
#include <zim/blob.h>
class TestArticle : public zim::writer::Article
......@@ -34,13 +34,12 @@ class TestArticle : public zim::writer::Article
virtual ~TestArticle() = default;
virtual std::string getAid() const;
virtual char getNamespace() const;
virtual std::string getUrl() const;
virtual zim::writer::Url getUrl() const;
virtual std::string getTitle() const;
virtual bool isRedirect() const;
virtual bool shouldCompress() const { return true; }
virtual std::string getMimeType() const;
virtual std::string getRedirectAid() const;
virtual zim::writer::Url getRedirectUrl() const;
virtual bool shouldIndex() const { return false; }
virtual zim::size_type getSize() const { return _data.size(); }
virtual std::string getFilename() const { return ""; }
......@@ -62,14 +61,9 @@ std::string TestArticle::getAid() const
return _id;
}
char TestArticle::getNamespace() const
zim::writer::Url TestArticle::getUrl() const
{
return 'A';
}
std::string TestArticle::getUrl() const
{
return _id;
return zim::writer::Url('A', _id);
}
std::string TestArticle::getTitle() const
......@@ -87,9 +81,9 @@ std::string TestArticle::getMimeType() const
return "text/plain";
}
std::string TestArticle::getRedirectAid() const
zim::writer::Url TestArticle::getRedirectUrl() const
{
return "";
return zim::writer::Url();
}
int main(int argc, char* argv[])
......@@ -105,7 +99,7 @@ int main(int argc, char* argv[])
}
try
{
zim::writer::ZimCreator c;
zim::writer::Creator c;
c.startZimCreation("foo.zim");
for (auto& article:_articles)
{
......
......@@ -16,7 +16,8 @@ install_headers(
install_headers(
'zim/writer/article.h',
'zim/writer/zimcreator.h',
'zim/writer/url.h',
'zim/writer/creator.h',
subdir:'zim/writer'
)
......@@ -81,8 +81,8 @@ namespace zim
const_iterator find(char ns, const std::string& url) const;
const_iterator find(const std::string& url) const;
const Search* search(const std::string& query, int start, int end) const;
const Search* suggestions(const std::string& query, int start, int end) const;
std::unique_ptr<Search> search(const std::string& query, int start, int end) const;
std::unique_ptr<Search> suggestions(const std::string& query, int start, int end) const;
time_t getMTime() const;
......
......@@ -24,6 +24,7 @@
#include <zim/blob.h>
#include <zim/zim.h>
#include <zim/uuid.h>
#include <zim/writer/url.h>
#include <string>
namespace zim
......@@ -34,9 +35,7 @@ namespace zim
class Article
{
public:
virtual std::string getAid() const = 0;
virtual char getNamespace() const = 0;
virtual std::string getUrl() const = 0;
virtual Url getUrl() const = 0;
virtual std::string getTitle() const = 0;
virtual bool isRedirect() const = 0;
virtual bool isLinktarget() const;
......@@ -44,8 +43,7 @@ namespace zim
virtual std::string getMimeType() const = 0;
virtual bool shouldCompress() const = 0;
virtual bool shouldIndex() const = 0;
virtual std::string getRedirectAid() const = 0;
virtual std::string getParameter() const;
virtual Url getRedirectUrl() const = 0;
virtual zim::size_type getSize() const = 0;
virtual Blob getData() const = 0;
virtual std::string getFilename() const = 0;
......
......@@ -17,8 +17,8 @@
*
*/
#ifndef ZIM_WRITER_ZIMCREATOR_H
#define ZIM_WRITER_ZIMCREATOR_H
#ifndef ZIM_WRITER_CREATOR_H
#define ZIM_WRITER_CREATOR_H
#include <memory>
#include <zim/zim.h>
......@@ -29,12 +29,12 @@ namespace zim
class Fileheader;
namespace writer
{
class ZimCreatorData;
class ZimCreator
class CreatorData;
class Creator
{
public:
ZimCreator(bool verbose = false);
virtual ~ZimCreator();
Creator(bool verbose = false);
virtual ~Creator();
zim::size_type getMinChunkSize() const { return minChunkSize; }
void setMinChunkSize(zim::size_type s) { minChunkSize = s; }
......@@ -46,12 +46,12 @@ namespace zim
virtual void addArticle(const Article& article);
virtual void finishZimCreation();
virtual std::string getMainPage() { return ""; }
virtual std::string getLayoutPage() { return ""; }
virtual Url getMainUrl() { return Url(); }
virtual Url getLayoutUrl() { return Url(); }
virtual zim::Uuid getUuid() { return Uuid::generate(); }
private:
std::unique_ptr<ZimCreatorData> data;
std::unique_ptr<CreatorData> data;
bool verbose;
bool withIndex = false;
size_t minChunkSize = 1024-64;
......@@ -66,4 +66,4 @@ namespace zim
}
#endif // ZIM_WRITER_ZIMCREATOR_H
#endif // ZIM_WRITER_CREATOR_H
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_WRITER_URL_H
#define ZIM_WRITER_URL_H
#include <string>
#include <utility>
namespace zim
{
namespace writer
{
/**
 * Pairs a one-character namespace with a url string.
 *
 * The "long" textual form is "<ns>/<url>" (see getLongUrl()).
 * A default-constructed Url has namespace 0 and an empty url, and reports
 * empty() == true.
 */
class Url {
  public:
    /** Build an empty Url (namespace 0, empty url). */
    Url() :
      url(),
      ns(0)
    {}

    /**
     * Build a Url from its two components.
     *
     * @param ns  namespace character.
     * @param url url inside that namespace (taken by value and moved into
     *            place, so callers passing a temporary pay no extra copy).
     */
    Url(char ns, std::string url) :
      url(std::move(url)),
      ns(ns)
    {}

    /**
     * Build a Url from its long form "<ns>/<url>".
     *
     * The first character becomes the namespace and everything from index 2
     * onwards becomes the url; the character at index 1 (the separator) is
     * skipped without being checked.
     *
     * Inputs shorter than the "<ns>/" prefix no longer throw
     * std::out_of_range: an empty string yields an empty Url, and a one or
     * two character string yields a Url with only the namespace set.
     *
     * @param url long form of the url.
     */
    Url(std::string url) :
      // Inside the initializers `url` names the parameter, not the member.
      url(url.size() > 2 ? url.substr(2) : std::string()),
      ns(url.empty() ? '\0' : url[0])
    {}

    /** @return the namespace character. */
    char getNs() const { return ns; }

    /** @return the url without its namespace prefix. */
    const std::string& getUrl() const { return url; }

    /** @return the long form "<ns>/<url>". */
    std::string getLongUrl() const { return std::string(1, ns) + '/' + url; }

    /** @return true when the namespace is 0 and the url is empty. */
    bool empty() const { return ns == 0 && url.empty(); }

  private:
    std::string url;
    char ns;

    friend bool operator< (const Url& lhs, const Url& rhs);
    friend bool operator== (const Url& lhs, const Url& rhs);
};

/** Order by namespace first, then by url within the same namespace. */
inline bool operator< (const Url& lhs, const Url& rhs) {
  return lhs.ns < rhs.ns
      || (lhs.ns == rhs.ns && lhs.url < rhs.url);
}

/** Two Urls are equal when both namespace and url match. */
inline bool operator== (const Url& lhs, const Url& rhs) {
  return lhs.ns == rhs.ns && lhs.url == rhs.url;
}
}
}
#endif // ZIM_WRITER_URL_H
project('libzim', ['c', 'cpp'],
version : '4.0.7',
version : '5.0.0',
license : 'GPL2',
default_options : ['c_std=c11', 'cpp_std=c++11', 'werror=true'])
......
......@@ -85,14 +85,20 @@ namespace zim
log_debug("read url, title and parameters");
offset_type url_size = strlen(buffer->data(current));
offset_type url_size = strnlen(
buffer->data(current),
buffer->size().v - current.v - extraLen
);
if (current.v + url_size >= buffer->size().v) {
throw(InvalidSize());
}
url = std::string(buffer->data(current), url_size);
current += url_size + 1;
offset_type title_size = strlen(buffer->data(current));
offset_type title_size = strnlen(
buffer->data(current),
buffer->size().v - current.v - extraLen
);
if (current.v + title_size >= buffer->size().v) {
throw(InvalidSize());
}
......
......@@ -176,15 +176,15 @@ namespace zim
return File::const_iterator(this, article_index_type(r.second), const_iterator::ArticleIterator);
}
const Search* File::search(const std::string& query, int start, int end) const {
Search* search = new Search(this);
std::unique_ptr<Search> File::search(const std::string& query, int start, int end) const {
auto search = std::unique_ptr<Search>(new Search(this));
search->set_query(query);
search->set_range(start, end);
return search;
}
const Search* File::suggestions(const std::string& query, int start, int end) const {
Search* search = new Search(this);
std::unique_ptr<Search> File::suggestions(const std::string& query, int start, int end) const {
auto search = std::unique_ptr<Search>(new Search(this));
search->set_query(query);
search->set_range(start, end);
search->set_suggestion_mode(true);
......
......@@ -246,17 +246,17 @@ char* uncompress(const Reader* reader, offset_t startOffset, zsize_t* dest_size)
char* ret_data = new char[_dest_size.v];
// The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
// is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
char* raw_data = new char[CHUNCK_SIZE];
std::vector<char> raw_data(CHUNCK_SIZE);
typename INFO::stream_t stream;
INFO::init_stream(&stream, raw_data);
INFO::init_stream(&stream, raw_data.data());
zim::size_type availableSize = reader->size().v - startOffset.v;
zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
reader->read(raw_data, startOffset, zsize_t(inputSize));
reader->read(raw_data.data(), startOffset, zsize_t(inputSize));
startOffset.v += inputSize;
availableSize -= inputSize;
stream.next_in = (unsigned char*)raw_data;
stream.next_in = (unsigned char*)raw_data.data();
stream.avail_in = inputSize;
stream.next_out = (unsigned char*) ret_data;
stream.avail_out = _dest_size.v;
......@@ -271,10 +271,10 @@ char* uncompress(const Reader* reader, offset_t startOffset, zsize_t* dest_size)
// So, we must fetch a new chunk of input data.
if (availableSize) {
inputSize = std::min(availableSize, CHUNCK_SIZE);
reader->read(raw_data, startOffset, zsize_t(inputSize));
reader->read(raw_data.data(), startOffset, zsize_t(inputSize));
startOffset.v += inputSize;
availableSize -= inputSize;
stream.next_in = (unsigned char*) raw_data;
stream.next_in = (unsigned char*) raw_data.data();
stream.avail_in = inputSize;
continue;
}
......
......@@ -26,7 +26,7 @@ common_sources = [
'uuid.cpp',
'levenshtein.cpp',
'tools.cpp',
'writer/zimcreator.cpp',
'writer/creator.cpp',
'writer/lzmastream.cpp',
'writer/article.cpp',
'writer/cluster.cpp',
......
......@@ -43,6 +43,13 @@
# define SEPARATOR "/"
#endif
#ifdef __MINGW32__
# include <time.h>
#else
# include <thread>
# include <chrono>
#endif
std::string zim::removeAccents(const std::string& text)
{
......@@ -56,3 +63,15 @@ std::string zim::removeAccents(const std::string& text)
ustring.toUTF8String(unaccentedText);
return unaccentedText;
}
/**
 * Suspend the calling thread for the requested duration.
 *
 * MinGW builds go through nanosleep() from <time.h>; every other build uses
 * std::this_thread::sleep_for().
 *
 * @param microseconds duration of the pause, in microseconds.
 */
void zim::microsleep(int microseconds) {
#ifdef __MINGW32__
  struct timespec wait = {0, 0};
  // Split into whole seconds plus the sub-second remainder. There are
  // 1000000 microseconds in a second; the remainder is then scaled to
  // nanoseconds and always stays below 1e9 as nanosleep() requires.
  wait.tv_sec = microseconds / 1000000;
  wait.tv_nsec = (microseconds % 1000000) * 1000;
  nanosleep(&wait, nullptr);
#else
  std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
#endif
}
......@@ -26,7 +26,7 @@
namespace zim {
std::string removeAccents(const std::string& text);
void microsleep(int microseconds);
}
#endif // OPENZIM_LIBZIM_TOOLS_H
......@@ -20,59 +20,157 @@
#ifndef ZIM_WRITER_DIRENT_H
#define ZIM_WRITER_DIRENT_H
#include "../_dirent.h"
#include "cluster.h"
#include "debug.h"
namespace zim
{
namespace writer
{
class Dirent : public zim::Dirent
namespace writer {
class Dirent;
// Location of an entry's content: the number of the cluster and the number
// of the blob.
struct DirectInfo {
// Both numbers start at 0.
DirectInfo() :
clusterNumber(0),
blobNumber(0)
{};
cluster_index_t clusterNumber;
blob_index_t blobNumber;
};
// Target of a redirect entry, held as a non-owning pointer to another Dirent.
struct RedirectInfo {
// nullptr until a target is assigned.
const Dirent* redirectDirent = nullptr;
};
// Storage shared between the two kinds of entry information.
// Dirent writes `r` in setRedirect() and `d` in its default constructor,
// setCluster() and setArticle(); only one of the two is meaningful at a time.
union DirentInfo {
DirectInfo d;
RedirectInfo r;
};
class Dirent
{
static const uint16_t redirectMimeType = 0xffff;
static const uint16_t linktargetMimeType = 0xfffe;
static const uint16_t deletedMimeType = 0xfffd;
static const uint32_t version = 0;
uint16_t mimeType;
DirentInfo info {};
Url url;
std::string title;
Cluster* cluster = nullptr;
std::string aid;
std::string redirectAid;
Url redirectUrl;
article_index_t idx = article_index_t(0);
public:
Dirent() {}
Dirent(const std::string& aid_)
: aid(aid_)
{}
Dirent()
: mimeType(0),
url(),
title(),
redirectUrl()
{
info.d.clusterNumber = cluster_index_t(0);
info.d.blobNumber = blob_index_t(0);
}
Dirent(Url url_ )
: Dirent()
{ url = url_; }
char getNamespace() const { return url.getNs(); }
const std::string& getTitle() const { return title.empty() ? url.getUrl() : title; }
void setTitle(const std::string& title_) { title = title_; }
const std::string& getUrl() const { return url.getUrl(); }
const Url& getFullUrl() const { return url; }
void setUrl(Url url_) {
url = url_;
}
uint32_t getVersion() const { return version; }
void setRedirectUrl(Url redirectUrl_) { redirectUrl = redirectUrl_; }
const Url& getRedirectUrl() const { return redirectUrl; }
void setRedirect(const Dirent* target) {
info.r.redirectDirent = target;
mimeType = redirectMimeType;
}
article_index_t getRedirectIndex() const { return isRedirect() ? info.r.redirectDirent->getIdx() : article_index_t(0); }
void setMimeType(uint16_t mime)
{
mimeType = mime;
}
void setLinktarget()
{
ASSERT(mimeType, ==, 0);
mimeType = linktargetMimeType;
}
void setDeleted()
{
ASSERT(mimeType, ==, 0);
mimeType = deletedMimeType;
}
Dirent(char ns, const std::string& url)
{ setUrl(ns, url); }
void setAid(const std::string& aid_) { aid = aid_; }
const std::string& getAid() const { return aid; }
void setRedirectAid(const std::string& aid_) { redirectAid = aid_; }
const std::string& getRedirectAid() const { return redirectAid; }
void setIdx(article_index_t idx_) { idx = idx_; }
article_index_t getIdx() const { return idx; }
void setCluster(zim::writer::Cluster* _cluster)
{ cluster = _cluster; blobNumber = _cluster->count(); }
cluster_index_t getClusterNumber() const { return cluster ? cluster->getClusterIndex() : clusterNumber; }
void setCluster(zim::writer::Cluster* _cluster)
{
ASSERT(isArticle(), ==, true);
cluster = _cluster;
info.d.blobNumber = _cluster->count();
}
cluster_index_t getClusterNumber() const {
return cluster ? cluster->getClusterIndex() : info.d.clusterNumber;
}
blob_index_t getBlobNumber() const {
return isRedirect() ? blob_index_t(0) : info.d.blobNumber;
}
bool isRedirect() const { return mimeType == redirectMimeType; }
bool isLinktarget() const { return mimeType == linktargetMimeType; }
bool isDeleted() const { return mimeType == deletedMimeType; }
bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); }
uint16_t getMimeType() const { return mimeType; }
size_t getDirentSize() const
{
size_t ret = (isRedirect() ? 12 : 16) + url.getUrl().size() + 2;
if (title != url.getUrl())
ret += title.size();
return ret;
}
void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_)
{
ASSERT(mimeType, ==, 0);
mimeType = mimeType_;
info.d.clusterNumber = clusterNumber_;
info.d.blobNumber = blobNumber_;
}
friend bool compareUrl(const Dirent* d1, const Dirent* d2);
friend inline bool compareTitle(const Dirent* d1, const Dirent* d2);
};
std::ostream& operator<< (std::ostream& out, const Dirent& d);
inline bool compareUrl(const Dirent& d1, const Dirent& d2)
inline bool compareUrl(const Dirent* d1, const Dirent* d2)
{
return d1.getNamespace() < d2.getNamespace()
|| (d1.getNamespace() == d2.getNamespace()
&& d1.getUrl() < d2.getUrl());
return d1->url < d2->url;
}
inline bool compareAid(const Dirent& d1, const Dirent& d2)
inline bool compareTitle(const Dirent* d1, const Dirent* d2)
{
return d1.getAid() < d2.getAid();
return d1->url.getNs() < d2->url.getNs()
|| (d1->url.getNs() == d2->url.getNs() && d1->getTitle() < d2->getTitle());
}
std::ostream& operator<< (std::ostream& out, const Dirent& d);
}
}
......
......@@ -33,10 +33,6 @@ namespace zim
{
return false;
}
std::string Article::getParameter() const
{
return std::string();
}
std::string Article::getNextCategory()
{
......
......@@ -50,8 +50,8 @@ Cluster::Cluster(CompressionType compression)
}
void Cluster::clear() {
offsets.clear();
_data.clear();
Offsets().swap(offsets);
ClusterData().swap(_data);
}
void Cluster::close() {
......@@ -100,13 +100,8 @@ void Cluster::write_offsets(std::ostream& out) const
void Cluster::write_final(std::ostream& out) const
{
if(getCompression() == zim::zimcompNone)
{
dump(out);
} else {
std::ifstream clustersFile(tmp_filename, std::ios::binary);
out << clustersFile.rdbuf();
}
std::ifstream clustersFile(tmp_filename, std::ios::binary);
out << clustersFile.rdbuf();
if (!out) {
throw std::runtime_error("failed to write cluster");
}
......@@ -114,46 +109,18 @@ void Cluster::write_final(std::ostream& out) const
void Cluster::dump_tmp(const std::string& directoryPath)
{
if(getCompression() == zim::zimcompNone)
{
//No real dump, store inmemory data in file
size_t file_index = 0;
for (auto& data: _data)
{
ASSERT(data.value.empty(), ==, false);
if (data.type == DataType::plain) {
std::ostringstream ss;
ss << directoryPath << SEPARATOR << "file_" << index << "_" << file_index << ".tmp";
auto filename = ss.str();
{
std::ofstream out(filename, std::ios::binary);
out << data.value;
if (!out) {
throw std::runtime_error(
std::string("failed to write temporary cluster file ")
+ filename);
}
}
data.type = DataType::file;
data.value = filename;
}
file_index++;
}
finalSize = zsize_t(size().v+1);
} else {
std::ostringstream ss;
ss << directoryPath << SEPARATOR << "cluster_" << index << ".clt";
tmp_filename = ss.str();
std::ofstream out(tmp_filename, std::ios::binary);
dump(out);
if (!out) {
throw std::runtime_error(
std::string("failed to write temporary cluster file ")
+ tmp_filename);
}
finalSize = zsize_t(out.tellp());
clear();
std::ostringstream ss;
ss << directoryPath << SEPARATOR << "cluster_" << index << ".clt";
tmp_filename = ss.str();
std::ofstream out(tmp_filename, std::ios::binary);
dump(out);
if (!out) {
throw std::runtime_error(
std::string("failed to write temporary cluster file ")
+ tmp_filename);
}
finalSize = zsize_t(out.tellp());
clear();
}
void Cluster::write(std::ostream& out) const
......
......@@ -17,8 +17,8 @@
*
*/
#ifndef ZIM_WRITER_ZIMCREATOR_DATA_H
#define ZIM_WRITER_ZIMCREATOR_DATA_H
#ifndef ZIM_WRITER_CREATOR_DATA_H
#define ZIM_WRITER_CREATOR_DATA_H
#include <zim/fileheader.h>
#include <zim/writer/article.h>
......@@ -30,6 +30,8 @@
#include <fstream>
#include "config.h"
#include "direntPool.h"
#if defined(ENABLE_XAPIAN)
class XapianIndexer;
#endif
......@@ -38,12 +40,25 @@ namespace zim
{
namespace writer
{
// Ordering functor over Dirent pointers; forwards to compareUrl().
// Used as the comparator of the UrlSortedDirents set.
struct UrlCompare {
  bool operator() (const Dirent* lhs, const Dirent* rhs) const
  {
    const bool lhsFirst = compareUrl(lhs, rhs);
    return lhsFirst;
  }
};
// Ordering functor over Dirent pointers; forwards to compareTitle().
// Used as the comparator of the TitleSortedDirents multiset.
struct TitleCompare {
  bool operator() (const Dirent* lhs, const Dirent* rhs) const
  {
    const bool lhsFirst = compareTitle(lhs, rhs);
    return lhsFirst;
  }
};
class Cluster;
class ZimCreatorData
class CreatorData
{
public:
typedef std::vector<Dirent> DirentsType;
typedef std::vector<article_index_t> ArticleIdxVectorType;
typedef std::set<Dirent*, UrlCompare> UrlSortedDirents;
typedef std::multiset<Dirent*, TitleCompare> TitleSortedDirents;
typedef std::vector<offset_t> OffsetsType;
typedef std::map<std::string, uint16_t> MimeTypesMap;
typedef std::map<uint16_t, std::string> RMimeTypesMap;
......@@ -52,16 +67,15 @@ namespace zim
typedef Queue<Cluster*> ClusterQueue;
typedef std::vector<pthread_t> ThreadList;