Skip to content
Commits on Source (4)
set(PACKAGE osm2pgsql)
set(PACKAGE_NAME osm2pgsql)
set(PACKAGE_VERSION 1.0.0)
set(PACKAGE_VERSION 1.2.0)
cmake_minimum_required(VERSION 2.8.7)
......
......@@ -28,7 +28,11 @@ db_copy_thread_t::~db_copy_thread_t() { finish(); }
void db_copy_thread_t::add_buffer(std::unique_ptr<db_cmd_t> &&buffer)
{
assert(m_worker.joinable()); // thread must not have been finished
std::unique_lock<std::mutex> lock(m_queue_mutex);
m_queue_full_cond.wait(lock,
[&]{ return m_worker_queue.size() < db_cmd_copy_t::Max_buffers; });
m_worker_queue.push_back(std::move(buffer));
m_queue_cond.notify_one();
}
......@@ -60,13 +64,11 @@ void db_copy_thread_t::worker_thread()
std::unique_ptr<db_cmd_t> item;
{
std::unique_lock<std::mutex> lock(m_queue_mutex);
if (m_worker_queue.empty()) {
m_queue_cond.wait(lock);
continue;
}
m_queue_cond.wait(lock, [&]{ return !m_worker_queue.empty(); });
item = std::move(m_worker_queue.front());
m_worker_queue.pop_front();
m_queue_full_cond.notify_one();
}
switch (item->type) {
......
......@@ -67,7 +67,22 @@ protected:
struct db_cmd_copy_t : public db_cmd_t
{
enum { Max_buf_size = 10 * 1024 * 1024 };
enum {
/** Size of a single buffer with COPY data for Postgresql.
* This is a trade-off between memory usage and sending large chunks
* to speed up processing. Currently a one-size fits all value.
* Needs more testing and individual values per queue.
*/
Max_buf_size = 10 * 1024 * 1024,
/** Maximum length of the queue with COPY data.
* In the usual case, PostgreSQL should be faster processing the
* data than it can be produced and there should only be one element
* in the queue. If PostgreSQL is slower, then the queue will always
* be full and it is better to keep the queue smaller to reduce memory
* usage. Current value is just assumed to be a reasonable trade off.
*/
Max_buffers = 10
};
/// Name of the target table for the copy operation
std::shared_ptr<db_target_descr_t> target;
/// Vector with object to delete before copying
......@@ -141,6 +156,7 @@ private:
std::thread m_worker;
std::mutex m_queue_mutex;
std::condition_variable m_queue_cond;
std::condition_variable m_queue_full_cond;
std::deque<std::unique_ptr<db_cmd_t>> m_worker_queue;
// Target for copy operation currently ongoing.
......
osm2pgsql (1.0.0+ds-2) UNRELEASED; urgency=medium
osm2pgsql (1.2.0+ds-1) unstable; urgency=medium
* New upstream release.
* Don't define ACCEPT_USE_OF_DEPRECATED_PROJ_API_H, fixed in libosmium.
* Bump Standards-Version to 4.4.1, no changes.
-- Bas Couwenberg <sebastic@debian.org> Fri, 30 Aug 2019 15:10:29 +0200
-- Bas Couwenberg <sebastic@debian.org> Tue, 22 Oct 2019 06:01:22 +0200
osm2pgsql (1.0.0+ds-1) unstable; urgency=medium
......
......@@ -112,9 +112,11 @@ imported into database columns and which tags get dropped. Defaults to /usr/shar
\fB\-C\fR|\-\-cache num
Only for slim mode: Use up to num many MB of RAM for caching nodes. Giving osm2pgsql sufficient cache
to store all imported nodes typically greatly increases the speed of the import. Each cached node
requires 8 bytes of cache, plus about 10% \- 30% overhead. For a current OSM full planet import with
its ~ 3 billion nodes, a good value would be 27000 if you have enough RAM. If you don't have enough
RAM, it is likely beneficial to give osm2pgsql close to the full available amount of RAM. Defaults to 800.
requires 8 bytes of cache, plus about 10% \- 30% overhead. As a rule of thumb,
give a bit more than the size of the import file in PBF format. If the RAM is not
big enough, use about 75% of memory. Make sure to leave enough RAM for PostgreSQL.
It needs at least the amount of `shared_buffers` given in its configuration.
Defaults to 800.
.TP
\fB\ \fR\-\-cache\-strategy strategy
There are a number of different modes in which osm2pgsql can organize its
......
......@@ -25,11 +25,15 @@ use them.
Performance is heavily influenced by other options, but there are some options
that only impact performance.
* ``--cache`` specifies how much memory to allocate for caching information. In
``--slim`` mode, this is just node positions while in non-slim it has to
store information about ways and relations too. The maximum RAM it is useful
to set this to in slim mode is 8 bytes * number of nodes / efficiency, where
efficiency ranges from 50% on small imports to 80% for a planet.
* ``--cache`` specifies how much memory in MB to allocate for caching information.
In ``--slim`` mode, this is just node positions while in non-slim it has to
store information about ways and relations too. The rule of thumb in slim mode
is as follows: use the size of the PBF file you are trying to import or about
75% of RAM, whichever is smaller. Make sure there is enough RAM left for
PostgreSQL. It needs at least the amount of `shared_buffers` given in its
configuration. You may also set ``--cache`` to 0 to disable node caching
completely. This only makes sense when a flat node file is given and there
is not enough RAM to fit most of the cache.
* ``--number-processes`` sets the number of processes to use. This should
typically be set to the number of CPU threads, but gains in speed are minimal
......
......@@ -8,6 +8,7 @@
#define basename /*SKIP IT*/
#endif
#include <boost/format.hpp>
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <osmium/version.hpp>
......@@ -119,7 +120,7 @@ namespace
\n\
Database options:\n\
-d|--database The name of the PostgreSQL database to connect to.\n\
-U|--username PostgreSQL user name (specify passsword in PGPASS\n\
-U|--username PostgreSQL user name (specify passsword in PGPASSWORD\n\
environment variable or use -W).\n\
-W|--password Force password prompt.\n\
-H|--host Database server host name or socket location.\n\
......@@ -227,7 +228,8 @@ namespace
printf(" %s -c -d gis --slim -C <cache size> -k \\\n", name);
printf(" --flat-nodes <flat nodes> planet-latest.osm.pbf\n");
printf("where\n");
printf(" <cache size> is 50000 on machines with 64GB or more RAM \n");
printf(" <cache size> should be equivalent to the size of the \n");
printf(" pbf file to be imported if there is enough RAM \n");
printf(" or about 75%% of memory in MB on machines with less\n");
printf(" <flat nodes> is a location where a 50+GB file can be saved.\n");
printf("\n");
......@@ -299,7 +301,7 @@ options_t::options_t()
output_backend("pgsql"), input_reader("auto"), bbox(boost::none),
extra_attributes(false), verbose(false)
{
num_procs = std::thread::hardware_concurrency();
num_procs = (int) std::min(4U, std::thread::hardware_concurrency());
if (num_procs < 1) {
fprintf(stderr, "WARNING: unable to detect number of hardware threads supported!\n");
num_procs = 1;
......
......@@ -457,12 +457,21 @@ void table_t::escape_type(const string &value, ColumnType flags)
switch (flags) {
case COLUMN_TYPE_INT: {
// For integers we take the first number, or the average if it's a-b
long from, to;
int items = sscanf(value.c_str(), "%ld-%ld", &from, &to);
if (items == 1) {
long long from, to;
// limit number of digits parsed to avoid undefined behaviour in sscanf
int items = sscanf(value.c_str(), "%18lld-%18lld", &from, &to);
if (items == 1 && from <= std::numeric_limits<int32_t>::max() &&
from >= std::numeric_limits<int32_t>::min()) {
m_copy.add_column(from);
} else if (items == 2) {
m_copy.add_column((from + to) / 2);
// calculate mean while avoiding overflows
int64_t mean = (from / 2) + (to / 2) + ((from % 2 + to % 2) / 2);
if (mean <= std::numeric_limits<int32_t>::max() &&
mean >= std::numeric_limits<int32_t>::min()) {
m_copy.add_column(mean);
} else {
m_copy.add_null_column();
}
} else {
m_copy.add_null_column();
}
......
......@@ -21,6 +21,7 @@ set(TESTS
test-output-multi-tags.cpp
test-output-pgsql-area.cpp
test-output-pgsql-schema.cpp
test-output-pgsql-int4.cpp
test-output-pgsql-tablespace.cpp
test-output-pgsql-validgeom.cpp
test-output-pgsql-z_order.cpp
......
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cassert>
#include <sstream>
#include <stdexcept>
#include <memory>
#include "osmtypes.hpp"
#include "osmdata.hpp"
#include "output-pgsql.hpp"
#include "options.hpp"
#include "middle-pgsql.hpp"
#include "middle-ram.hpp"
#include "taginfo_impl.hpp"
#include <sys/types.h>
#include <unistd.h>
#include <boost/lexical_cast.hpp>
#include "tests/middle-tests.hpp"
#include "tests/common-pg.hpp"
#include "tests/common.hpp"
namespace {
/// Exception type used to signal that a test could not be run and should be
/// treated as skipped instead of failed.
struct skip_test : public std::exception {
    /// Fixed, human-readable description of the skip condition.
    /// Marked override so a signature mismatch with std::exception::what()
    /// is rejected by the compiler instead of silently hiding the base.
    const char *what() const noexcept override { return "Test skipped."; }
};
/**
 * Execute one test function and report its outcome on stderr.
 *
 * The test name is printed first. If the test throws skip_test the process
 * exits with status 77; if it throws any other std::exception the message
 * and "FAIL" are printed and the process exits with EXIT_FAILURE. When the
 * test returns normally "PASS" is printed and control returns to the caller.
 *
 * \param test_name Name printed before the test is run.
 * \param testfunc  Test function to call.
 */
void run_test(const char* test_name, void (*testfunc)()) {
    // Announce the test before running it so output is attributable.
    fprintf(stderr, "%s\n", test_name);

    try {
        testfunc();
    } catch (const skip_test &) {
        exit(77); // <-- code to skip this test.
    } catch (const std::exception& failure) {
        fprintf(stderr, "%s\n", failure.what());
        fprintf(stderr, "FAIL\n");
        exit(EXIT_FAILURE);
    }

    // Only reached when the test function returned without throwing.
    fprintf(stderr, "PASS\n");
}
// Run test function x via run_test(), passing the stringified expression x
// as the test name and the address of x as the function to call.
#define RUN_TEST(x) run_test(#x, &(x))
// "simple" test modeled on the basic regression test from
// the python script. this is just to check everything is
// working as expected before we start the complex stuff.
void test_int4() {
std::unique_ptr<pg::tempdb> db;
try {
db.reset(new pg::tempdb);
} catch (const std::exception &e) {
std::cerr << "Unable to setup database: " << e.what() << "\n";
throw skip_test();
}
std::string proc_name("test-output-pgsql-int4"), input_file("-");
char *argv[] = { &proc_name[0], &input_file[0], nullptr };
options_t options = options_t(2, argv);
options.database_options = db->database_options;
options.num_procs = 1;
options.slim = 1;
options.prefix = "osm2pgsql_test";
options.style = "tests/test_output_pgsql_int4.style";
testing::run_osm2pgsql(options, "tests/test_output_pgsql_int4.osm",
"xml");
db->assert_has_table("osm2pgsql_test_point");
db->assert_has_table("osm2pgsql_test_line");
db->assert_has_table("osm2pgsql_test_polygon");
db->assert_has_table("osm2pgsql_test_roads");
// First three nodes have population values that are out of range for int4 columns
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 1");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 2");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 3");
// Check values that are valid for int4 columns, including limits
db->check_count(2147483647, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 4");
db->check_count(10000, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 5");
db->check_count(-10000, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 6");
db->check_count(-2147483648, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 7");
// More out of range negative values
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 8");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 9");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 10");
// Ranges are also parsed into int4 columns
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 11");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 12");
// Check values that are valid for int4 columns, including limits
db->check_count(2147483647, "SELECT population FROM osm2pgsql_test_point WHERE osm_id =13");
db->check_count(15000, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 14");
db->check_count(-15000, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 15");
db->check_count(-2147483648, "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 16");
// More out of range negative values
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 17");
db->check_string("", "SELECT population FROM osm2pgsql_test_point WHERE osm_id = 18");
}
} // anonymous namespace
/// Test driver entry point: runs test_int4 through RUN_TEST and returns 0
/// if control comes back from it.
int main(int argc, char *argv[]) {
    // Command line arguments are intentionally unused.
    static_cast<void>(argc);
    static_cast<void>(argv);

    RUN_TEST(test_int4);

    return 0;
}
<?xml version="1.0" encoding="UTF-8"?>
<osm version="0.6">
<node id="1" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000000000000000000" />
<tag k="name" v="longer than long" />
</node>
<node id="2" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000000000" />
<tag k="name" v="long (ten billion)" />
</node>
<node id="3" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="2147483648" />
<tag k="name" v="postgresql one more than int4 type" />
</node>
<node id="4" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="2147483647" />
<tag k="name" v="postgresql max int4 type" />
</node>
<node id="5" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000" />
<tag k="name" v="ten thousand" />
</node>
<node id="6" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000" />
<tag k="name" v="minus ten thousand" />
</node>
<node id="7" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-2147483648" />
<tag k="name" v="postgresql min int4 type" />
</node>
<node id="8" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-2147483649" />
<tag k="name" v="postgresql one less than min int4 type" />
</node>
<node id="9" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000000000" />
<tag k="name" v="minus long (minus ten billion)" />
</node>
<node id="10" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000000000000000000" />
<tag k="name" v="minus longer than long" />
</node>
<node id="11" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000000000000000000-20000000000000000000" />
<tag k="name" v="range, longer than long" />
</node>
<node id="12" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000000000-20000000000" />
<tag k="name" v="range, 15 billion" />
</node>
<node id="13" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="2147483646-2147483648" />
<tag k="name" v="range, mean is max int4" />
</node>
<node id="14" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="10000-20000" />
<tag k="name" v="range, 15 thousand" />
</node>
<node id="15" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000--20000" />
<tag k="name" v="range, negative 15 thousand" />
</node>
<node id="16" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-2147483647--2147483649" />
<tag k="name" v="range, mean is min int4" />
</node>
<node id="17" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000000000--20000000000" />
<tag k="name" v="range, negative 15 billion" />
</node>
<node id="18" visible="true" version="1" changeset="1" timestamp="2018-10-31T10:20:19Z" user="a" uid="1" lat="51.4779481" lon="-0.0014863">
<tag k="population" v="-10000000000000000000--20000000000000000000" />
<tag k="name" v="range, negative longer than long" />
</node>
</osm>
# OsmType Tag DataType Flags
node,way population int4 linear
node,way name text linear