mirror of
https://github.com/torvalds/linux.git
synced 2025-11-01 09:09:47 +02:00
bcachefs: Journal read error message improvements
- Don't print a checksum error when we first read a journal entry: we print a checksum error later if we'll be using the journal entry. - Continuing with the theme of of improving error messages and grouping errors into a single log message per error, print a single 'checksum error' message per journal entry, and use bch2_journal_ptr_to_text() to print out where on the device it was. - Factor out checksum error messages and checking for missing journal entries into helpers, bch2_journal_read() has gotten obnoxiously big. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
72ab5136e8
commit
cd04497b10
2 changed files with 141 additions and 90 deletions
|
|
@ -49,14 +49,31 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
|
|||
mutex_unlock(&c->sb_lock);
|
||||
}
|
||||
|
||||
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
struct journal_replay *j)
|
||||
static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
|
||||
{
|
||||
struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
|
||||
prt_printf(out, "%s %u:%u:%u (sector %llu)",
|
||||
ca ? ca->name : "(invalid dev)",
|
||||
p->dev, p->bucket, p->bucket_offset, p->sector);
|
||||
bch2_dev_put(ca);
|
||||
}
|
||||
|
||||
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
|
||||
{
|
||||
darray_for_each(j->ptrs, i) {
|
||||
if (i != j->ptrs.data)
|
||||
prt_printf(out, " ");
|
||||
prt_printf(out, "%u:%u:%u (sector %llu)",
|
||||
i->dev, i->bucket, i->bucket_offset, i->sector);
|
||||
bch2_journal_ptr_to_text(out, c, i);
|
||||
}
|
||||
}
|
||||
|
||||
static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
|
||||
{
|
||||
for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
|
||||
struct jset_entry_datetime *datetime =
|
||||
container_of(entry, struct jset_entry_datetime, entry);
|
||||
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -64,15 +81,9 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
|
|||
struct journal_replay *j)
|
||||
{
|
||||
prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
|
||||
|
||||
bch2_journal_datetime_to_text(out, &j->j);
|
||||
prt_char(out, ' ');
|
||||
bch2_journal_ptrs_to_text(out, c, j);
|
||||
|
||||
for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
|
||||
struct jset_entry_datetime *datetime =
|
||||
container_of(entry, struct jset_entry_datetime, entry);
|
||||
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static struct nonce journal_nonce(const struct jset *jset)
|
||||
|
|
@ -1037,7 +1048,6 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
|
||||
end = offset + ca->mi.bucket_size;
|
||||
bool saw_bad = false, csum_good;
|
||||
struct printbuf err = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
pr_debug("reading %u", bucket);
|
||||
|
|
@ -1078,7 +1088,7 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
* found on a different device, and missing or
|
||||
* no journal entries will be handled later
|
||||
*/
|
||||
goto out;
|
||||
return 0;
|
||||
}
|
||||
|
||||
j = buf->data;
|
||||
|
|
@ -1095,12 +1105,12 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
ret = journal_read_buf_realloc(buf,
|
||||
vstruct_bytes(j));
|
||||
if (ret)
|
||||
goto err;
|
||||
return ret;
|
||||
}
|
||||
goto reread;
|
||||
case JOURNAL_ENTRY_NONE:
|
||||
if (!saw_bad)
|
||||
goto out;
|
||||
return 0;
|
||||
/*
|
||||
* On checksum error we don't really trust the size
|
||||
* field of the journal entry we read, so try reading
|
||||
|
|
@ -1109,7 +1119,7 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
sectors = block_sectors(c);
|
||||
goto next_block;
|
||||
default:
|
||||
goto err;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
|
||||
|
|
@ -1126,22 +1136,20 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
* bucket:
|
||||
*/
|
||||
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
|
||||
goto out;
|
||||
return 0;
|
||||
|
||||
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
|
||||
|
||||
enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
|
||||
struct bch_csum csum;
|
||||
csum_good = jset_csum_good(c, j, &csum);
|
||||
|
||||
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
|
||||
|
||||
if (!csum_good) {
|
||||
bch_err_dev_ratelimited(ca, "%s",
|
||||
(printbuf_reset(&err),
|
||||
prt_str(&err, "journal "),
|
||||
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
|
||||
err.buf));
|
||||
/*
|
||||
* Don't print an error here, we'll print the error
|
||||
* later if we need this journal entry
|
||||
*/
|
||||
saw_bad = true;
|
||||
}
|
||||
|
||||
|
|
@ -1153,6 +1161,7 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
mutex_lock(&jlist->lock);
|
||||
ret = journal_entry_add(c, ca, (struct journal_ptr) {
|
||||
.csum_good = csum_good,
|
||||
.csum = csum,
|
||||
.dev = ca->dev_idx,
|
||||
.bucket = bucket,
|
||||
.bucket_offset = offset -
|
||||
|
|
@ -1167,7 +1176,7 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
|
||||
break;
|
||||
default:
|
||||
goto err;
|
||||
return ret;
|
||||
}
|
||||
next_block:
|
||||
pr_debug("next");
|
||||
|
|
@ -1176,11 +1185,7 @@ static int journal_read_bucket(struct bch_dev *ca,
|
|||
j = ((void *) j) + (sectors << 9);
|
||||
}
|
||||
|
||||
out:
|
||||
ret = 0;
|
||||
err:
|
||||
printbuf_exit(&err);
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static CLOSURE_CALLBACK(bch2_journal_read_device)
|
||||
|
|
@ -1229,13 +1234,105 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
|
|||
goto out;
|
||||
}
|
||||
|
||||
noinline_for_stack
|
||||
static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
|
||||
{
|
||||
struct printbuf buf = PRINTBUF;
|
||||
enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
|
||||
bool have_good = false;
|
||||
|
||||
prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
|
||||
bch2_journal_datetime_to_text(&buf, &j->j);
|
||||
prt_newline(&buf);
|
||||
|
||||
darray_for_each(j->ptrs, ptr)
|
||||
if (!ptr->csum_good) {
|
||||
bch2_journal_ptr_to_text(&buf, c, ptr);
|
||||
prt_char(&buf, ' ');
|
||||
bch2_csum_to_text(&buf, csum_type, ptr->csum);
|
||||
prt_newline(&buf);
|
||||
} else {
|
||||
have_good = true;
|
||||
}
|
||||
|
||||
prt_printf(&buf, "should be ");
|
||||
bch2_csum_to_text(&buf, csum_type, j->j.csum);
|
||||
|
||||
if (have_good)
|
||||
prt_printf(&buf, "\n(had good copy on another device)");
|
||||
|
||||
bch2_print_str(c, KERN_ERR, buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
|
||||
noinline_for_stack
|
||||
static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
|
||||
{
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
struct genradix_iter radix_iter;
|
||||
struct journal_replay *i, **_i, *prev = NULL;
|
||||
u64 seq = start_seq;
|
||||
|
||||
genradix_for_each(&c->journal_entries, radix_iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (journal_replay_ignore(i))
|
||||
continue;
|
||||
|
||||
BUG_ON(seq > le64_to_cpu(i->j.seq));
|
||||
|
||||
while (seq < le64_to_cpu(i->j.seq)) {
|
||||
while (seq < le64_to_cpu(i->j.seq) &&
|
||||
bch2_journal_seq_is_blacklisted(c, seq, false))
|
||||
seq++;
|
||||
|
||||
if (seq == le64_to_cpu(i->j.seq))
|
||||
break;
|
||||
|
||||
u64 missing_start = seq;
|
||||
|
||||
while (seq < le64_to_cpu(i->j.seq) &&
|
||||
!bch2_journal_seq_is_blacklisted(c, seq, false))
|
||||
seq++;
|
||||
|
||||
u64 missing_end = seq - 1;
|
||||
|
||||
printbuf_reset(&buf);
|
||||
prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
|
||||
missing_start, missing_end,
|
||||
start_seq, end_seq);
|
||||
|
||||
prt_printf(&buf, "\nprev at ");
|
||||
if (prev) {
|
||||
bch2_journal_ptrs_to_text(&buf, c, prev);
|
||||
prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
|
||||
} else
|
||||
prt_printf(&buf, "(none)");
|
||||
|
||||
prt_printf(&buf, "\nnext at ");
|
||||
bch2_journal_ptrs_to_text(&buf, c, i);
|
||||
prt_printf(&buf, ", continue?");
|
||||
|
||||
fsck_err(c, journal_entries_missing, "%s", buf.buf);
|
||||
}
|
||||
|
||||
prev = i;
|
||||
seq++;
|
||||
}
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_journal_read(struct bch_fs *c,
|
||||
u64 *last_seq,
|
||||
u64 *blacklist_seq,
|
||||
u64 *start_seq)
|
||||
{
|
||||
struct journal_list jlist;
|
||||
struct journal_replay *i, **_i, *prev = NULL;
|
||||
struct journal_replay *i, **_i;
|
||||
struct genradix_iter radix_iter;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
bool degraded = false, last_write_torn = false;
|
||||
|
|
@ -1354,56 +1451,9 @@ int bch2_journal_read(struct bch_fs *c,
|
|||
}
|
||||
}
|
||||
|
||||
/* Check for missing entries: */
|
||||
seq = *last_seq;
|
||||
genradix_for_each(&c->journal_entries, radix_iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (journal_replay_ignore(i))
|
||||
continue;
|
||||
|
||||
BUG_ON(seq > le64_to_cpu(i->j.seq));
|
||||
|
||||
while (seq < le64_to_cpu(i->j.seq)) {
|
||||
u64 missing_start, missing_end;
|
||||
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
|
||||
|
||||
while (seq < le64_to_cpu(i->j.seq) &&
|
||||
bch2_journal_seq_is_blacklisted(c, seq, false))
|
||||
seq++;
|
||||
|
||||
if (seq == le64_to_cpu(i->j.seq))
|
||||
break;
|
||||
|
||||
missing_start = seq;
|
||||
|
||||
while (seq < le64_to_cpu(i->j.seq) &&
|
||||
!bch2_journal_seq_is_blacklisted(c, seq, false))
|
||||
seq++;
|
||||
|
||||
if (prev) {
|
||||
bch2_journal_ptrs_to_text(&buf1, c, prev);
|
||||
prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
|
||||
} else
|
||||
prt_printf(&buf1, "(none)");
|
||||
bch2_journal_ptrs_to_text(&buf2, c, i);
|
||||
|
||||
missing_end = seq - 1;
|
||||
fsck_err(c, journal_entries_missing,
|
||||
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
|
||||
"prev at %s\n"
|
||||
"next at %s, continue?",
|
||||
missing_start, missing_end,
|
||||
*last_seq, *blacklist_seq - 1,
|
||||
buf1.buf, buf2.buf);
|
||||
|
||||
printbuf_exit(&buf1);
|
||||
printbuf_exit(&buf2);
|
||||
}
|
||||
|
||||
prev = i;
|
||||
seq++;
|
||||
}
|
||||
ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
genradix_for_each(&c->journal_entries, radix_iter, _i) {
|
||||
union bch_replicas_padded replicas = {
|
||||
|
|
@ -1416,15 +1466,15 @@ int bch2_journal_read(struct bch_fs *c,
|
|||
if (journal_replay_ignore(i))
|
||||
continue;
|
||||
|
||||
darray_for_each(i->ptrs, ptr) {
|
||||
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
|
||||
|
||||
if (!ptr->csum_good)
|
||||
bch_err_dev_offset(ca, ptr->sector,
|
||||
"invalid journal checksum, seq %llu%s",
|
||||
le64_to_cpu(i->j.seq),
|
||||
i->csum_good ? " (had good copy on another device)" : "");
|
||||
}
|
||||
/*
|
||||
* Don't print checksum errors until we know we're going to use
|
||||
* a given journal entry:
|
||||
*/
|
||||
darray_for_each(i->ptrs, ptr)
|
||||
if (!ptr->csum_good) {
|
||||
bch2_journal_print_checksum_error(c, i);
|
||||
break;
|
||||
}
|
||||
|
||||
ret = jset_validate(c,
|
||||
bch2_dev_have_ref(c, i->ptrs.data[0].dev),
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
|
|||
|
||||
struct journal_ptr {
|
||||
bool csum_good;
|
||||
struct bch_csum csum;
|
||||
u8 dev;
|
||||
u32 bucket;
|
||||
u32 bucket_offset;
|
||||
|
|
|
|||
Loading…
Reference in a new issue