// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "db/db_impl/compacted_db_impl.h"

#include "db/db_impl/db_impl.h"
#include "db/version_set.h"
#include "logging/logging.h"
#include "table/get_context.h"
#include "util/cast_util.h"

namespace ROCKSDB_NAMESPACE {

extern void MarkKeyMayExist(void* arg);
extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
                      const Slice& v, bool hit_and_return);
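
// CompactedDBImpl serves a database whose live SST files all sit in a single
// level (one file in L0, or only the bottommost non-empty level), so every
// lookup can go straight to exactly one table file without touching memtables
// or walking levels. The constructor opens the DBImpl base in read-only mode;
// cfd_, version_ and user_comparator_ are populated later by Init().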
CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
                                 const std::string& dbname)
    : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
             /*read_only*/ true),
      cfd_(nullptr),
      version_(nullptr),
      user_comparator_(nullptr) {}

CompactedDBImpl::~CompactedDBImpl() {}
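
// Binary-search files_ (which are sorted by key range) for the index of the
// first file whose largest user key is >= key; if every searched file's
// largest key is smaller, the last file's index is returned.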
size_t CompactedDBImpl::FindFile(const Slice& key) {
  size_t right = files_.num_files - 1;
  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
    return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
  };
  return static_cast<size_t>(
      std::lower_bound(files_.files, files_.files + right, key, cmp) -
      files_.files);
}
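
// Point lookup. This overload simply forwards to the timestamp-aware overload
// below with timestamp == nullptr.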
Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
                            const Slice& key, PinnableSlice* value) {
  return Get(options, /*column_family*/ nullptr, key, value,
             /*timestamp*/ nullptr);
}
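
// Timestamp-aware point lookup. Validates the user-defined-timestamp setup,
// builds the lookup key, locates the single candidate file via FindFile(), and
// returns NotFound early if the key sorts before that file's smallest key;
// otherwise the table reader fills the GetContext directly.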
Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
                            const Slice& key, PinnableSlice* value,
                            std::string* timestamp) {
  assert(user_comparator_);
  if (options.timestamp) {
    const Status s = FailIfTsMismatchCf(
        DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
    if (!s.ok()) {
      return s;
    }
  } else {
    const Status s = FailIfCfHasTs(DefaultColumnFamily());
    if (!s.ok()) {
      return s;
    }
  }

  // Clear the timestamps for returning results so that we can distinguish
  // between tombstone or key that has never been written
  if (timestamp) {
    timestamp->clear();
  }

  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
  std::string* ts =
      user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
  LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
                         GetContext::kNotFound, lkey.user_key(), value,
                         /*columns=*/nullptr, ts, nullptr, nullptr, true,
                         nullptr, nullptr, nullptr, nullptr, &read_cb);

  const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
  if (user_comparator_->CompareWithoutTimestamp(
          key, /*a_has_ts=*/false,
          ExtractUserKeyAndStripTimestamp(f.smallest_key,
                                          user_comparator_->timestamp_size()),
          /*b_has_ts=*/false) < 0) {
    return Status::NotFound();
  }
  Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
                                    nullptr);
  if (!s.ok() && !s.IsNotFound()) {
    return s;
  }
  if (get_context.State() == GetContext::kFound) {
    return Status::OK();
  }
  return Status::NotFound();
}
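
// Batched lookup. This overload forwards to the timestamp-aware overload below
// with timestamps == nullptr.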
std::vector<Status> CompactedDBImpl::MultiGet(
    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  return MultiGet(options, keys, values, /*timestamps*/ nullptr);
}
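
// Timestamp-aware batched lookup. A first pass maps every key to its candidate
// table reader (calling Prepare() as an early hint about the upcoming lookup,
// or recording nullptr when the key sorts below the file's smallest key); a
// second pass then probes each prepared reader.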
std::vector<Status> CompactedDBImpl::MultiGet(
    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
    const std::vector<Slice>& keys, std::vector<std::string>* values,
    std::vector<std::string>* timestamps) {
  assert(user_comparator_);
  size_t num_keys = keys.size();

  if (options.timestamp) {
    Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
                                  /*ts_for_read=*/true);
    if (!s.ok()) {
      return std::vector<Status>(num_keys, s);
    }
  } else {
    Status s = FailIfCfHasTs(DefaultColumnFamily());
    if (!s.ok()) {
      return std::vector<Status>(num_keys, s);
    }
  }

  // Clear the timestamps for returning results so that we can distinguish
  // between tombstone or key that has never been written
  if (timestamps) {
    for (auto& ts : *timestamps) {
      ts.clear();
    }
  }

  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
  autovector<TableReader*, 16> reader_list;
  for (const auto& key : keys) {
    LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
    const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
    if (user_comparator_->CompareWithoutTimestamp(
            key, /*a_has_ts=*/false,
            ExtractUserKeyAndStripTimestamp(
                f.smallest_key, user_comparator_->timestamp_size()),
            /*b_has_ts=*/false) < 0) {
      reader_list.push_back(nullptr);
    } else {
      f.fd.table_reader->Prepare(lkey.internal_key());
      reader_list.push_back(f.fd.table_reader);
    }
  }
  std::vector<Status> statuses(num_keys, Status::NotFound());
  values->resize(num_keys);
  if (timestamps) {
    timestamps->resize(num_keys);
  }
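
  // Second pass: probe each prepared reader; keys whose reader is nullptr keep
  // their default NotFound status.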
  int idx = 0;
  for (auto* r : reader_list) {
    if (r != nullptr) {
      PinnableSlice pinnable_val;
      std::string& value = (*values)[idx];
      LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
      std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
      GetContext get_context(
          user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
          lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
          user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
          nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
      Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
      assert(static_cast<size_t>(idx) < statuses.size());
      if (!s.ok() && !s.IsNotFound()) {
        statuses[idx] = s;
      } else {
        value.assign(pinnable_val.data(), pinnable_val.size());
        if (get_context.State() == GetContext::kFound) {
          statuses[idx] = Status::OK();
        }
      }
    }
    ++idx;
  }
  return statuses;
}
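
// Recovers the default column family in read-only mode and verifies that the
// LSM tree is "fully compacted": either a single file in L0 and nothing else,
// or files only in the bottommost non-empty level. On success, files_ points
// at that level's LevelFilesBrief.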
Status CompactedDBImpl::Init(const Options& options) {
  SuperVersionContext sv_context(/* create_superversion */ true);
  mutex_.Lock();
  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
                            ColumnFamilyOptions(options));
  Status s = Recover({cf}, true /* read only */, false, true);
  if (s.ok()) {
    cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
               ->cfd();
    cfd_->InstallSuperVersion(&sv_context, &mutex_);
  }
  mutex_.Unlock();
  sv_context.Clean();
  if (!s.ok()) {
    return s;
  }
  NewThreadStatusCfInfo(cfd_);
  version_ = cfd_->GetSuperVersion()->current;
  user_comparator_ = cfd_->user_comparator();
  auto* vstorage = version_->storage_info();
  if (vstorage->num_non_empty_levels() == 0) {
    return Status::NotSupported("no file exists");
  }
  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
  // L0 should have at most one file
  if (l0.num_files > 1) {
    return Status::NotSupported("L0 contain more than 1 file");
  }
  if (l0.num_files == 1) {
    if (vstorage->num_non_empty_levels() > 1) {
      return Status::NotSupported("Both L0 and other level contain files");
    }
    files_ = l0;
    return Status::OK();
  }

  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
    if (vstorage->LevelFilesBrief(i).num_files > 0) {
      return Status::NotSupported("Other levels also contain files");
    }
  }

  int level = vstorage->num_non_empty_levels() - 1;
  if (vstorage->LevelFilesBrief(level).num_files > 0) {
    files_ = vstorage->LevelFilesBrief(level);
    return Status::OK();
  }
  return Status::NotSupported("no file exists");
}
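
// Entry point for opening a fully compacted DB read-only. Rejects option
// combinations this implementation cannot honor (table readers must stay
// pinned, hence max_open_files == -1, and merge operands cannot be resolved),
// then runs Init() to validate the LSM shape.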
Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
                             DB** dbptr) {
  *dbptr = nullptr;

  if (options.max_open_files != -1) {
    return Status::InvalidArgument("require max_open_files = -1");
  }
  if (options.merge_operator.get() != nullptr) {
    return Status::InvalidArgument("merge operator is not supported");
  }
  DBOptions db_options(options);
  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
  Status s = db->Init(options);
  if (s.ok()) {
    s = db->StartPeriodicTaskScheduler();
  }
  if (s.ok()) {
    ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
                   "Opened the db as fully compacted mode");
    LogFlush(db->immutable_db_options_.info_log);
    *dbptr = db.release();
  }
  return s;
}
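
// Note: callers normally do not construct CompactedDBImpl directly. As of
// recent RocksDB versions, DB::OpenForReadOnly() tries this path and falls
// back to the general read-only implementation if Open() returns non-OK (see
// db_impl_readonly.cc). Illustrative caller-side sketch:
//
//   Options options;
//   options.max_open_files = -1;  // required for the compacted path
//   DB* db = nullptr;
//   Status s = DB::OpenForReadOnly(options, "/path/to/db", &db);
//   // On success, Get()/MultiGet() are served by CompactedDBImpl if the DB
//   // was fully compacted, otherwise by the regular read-only DB.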

}  // namespace ROCKSDB_NAMESPACE