// Copyright (C) 2015-2021  Nexedi SA and Contributors.
//                          Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.

/*
Git-backup - Backup set of Git repositories & just files; efficiently.

This program backs up files and a set of bare Git repositories into one Git repository.
Files are copied to blobs and then added to tree under certain place, and for
Git repositories, all reachable objects are pulled in while maintaining an index
which remembers reference -> sha1 for every pulled repository.

After objects from backuped Git repositories are pulled in, we create new
commit which references tree with changed backup index and files, and also has
all head objects from pulled-in repositories in its parents(*). This way backup
has history and all pulled objects become reachable from single head commit in
backup repository. In particular that means that the whole state of backup can
be described with only single sha1, and that backup repository itself could be
synchronized via standard git pull/push, be repacked, etc.

Restoration process is the opposite - from a particular backup state, files are
extracted at a proper place, and for Git repositories a pack with all objects
reachable from that repository heads is prepared and extracted from backup
repository object database.

This approach allows to leverage Git's good ability for object contents
deduplication and packing, especially for cases when there are many hosted
repositories which are forks of each other with relatively minor changes in
between each other and over time, and mostly common base. In author experience
the size of backup is dramatically smaller compared to straightforward "let's
tar it all" approach.

Data for all backuped files and repositories can be accessed if one has access
to backup repository, so either they all should be in the same security domain,
or extra care has to be taken to protect access to backup repository.

File permissions are not managed with strict details due to inherent
nature of Git. This aspect can be improved with e.g. etckeeper-like
(http://etckeeper.branchable.com/) approach if needed.

Please see README.rst with user-level overview on how to use git-backup.

NOTE the idea of pulling all refs together is similar to git-namespaces
     http://git-scm.com/docs/gitnamespaces

(*) Tag objects are handled specially - because in a lot of places Git insists and
    assumes commit parents can only be commit objects. We encode tag objects in
    specially-crafted commit object on pull, and decode back on backup restore.

    We do likewise if a ref points to tree or blob, which is valid in Git.
*/
package main

import (
70
	"context"
Kirill Smelkov's avatar
Kirill Smelkov committed
71 72 73 74
	"flag"
	"fmt"
	"io/ioutil"
	"os"
75
	"os/signal"
Kirill Smelkov's avatar
Kirill Smelkov committed
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
	pathpkg "path"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"sort"
	"strings"
	"syscall"
	"time"

	"lab.nexedi.com/kirr/go123/exc"
	"lab.nexedi.com/kirr/go123/mem"
	"lab.nexedi.com/kirr/go123/my"
	"lab.nexedi.com/kirr/go123/xerr"
	"lab.nexedi.com/kirr/go123/xflag"
	"lab.nexedi.com/kirr/go123/xstrings"
91
	"lab.nexedi.com/kirr/go123/xsync"
Kirill Smelkov's avatar
Kirill Smelkov committed
92

93
	git "github.com/libgit2/git2go/v31"
Kirill Smelkov's avatar
Kirill Smelkov committed
94 95 96 97 98 99 100 101 102 103
)

// verbose is the verbosity level of output:
//
//	0 - silent
//	1 - info
//	2 - progress of long-running operations
//	3 - debug
//
// infof prints at level >= 1, debugf prints at level >= 3, and at level >= 2
// gitprogress asks not to redirect output of git subprocesses.
var verbose = 1

func infof(format string, a ...interface{}) {
Kirill Smelkov's avatar
Kirill Smelkov committed
104 105 106 107
	if verbose > 0 {
		fmt.Printf(format, a...)
		fmt.Println()
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
108 109 110 111 112
}

// what to pass to git subprocess to stdout/stderr
// DontRedirect - no-redirection, PIPE - output to us
func gitprogress() StdioRedirect {
Kirill Smelkov's avatar
Kirill Smelkov committed
113 114 115 116
	if verbose > 1 {
		return DontRedirect
	}
	return PIPE
Kirill Smelkov's avatar
Kirill Smelkov committed
117 118 119
}

func debugf(format string, a ...interface{}) {
Kirill Smelkov's avatar
Kirill Smelkov committed
120 121 122 123
	if verbose > 2 {
		fmt.Printf(format, a...)
		fmt.Println()
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
124 125
}

126 127 128
// njobs is how many jobs to spawn at most.
//
// It is initialized to the number of CPUs reported by runtime.NumCPU.
var njobs = runtime.NumCPU()

129
// -------- create/extract blob --------
Kirill Smelkov's avatar
Kirill Smelkov committed
130 131

// file -> blob_sha1, mode
132
func file_to_blob(g *git.Repository, path string) (Sha1, uint32) {
Kirill Smelkov's avatar
Kirill Smelkov committed
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
	var blob_content []byte

	// because we want to pass mode to outside world (to e.g. `git update-index`)
	// we need to get native OS mode, not translated one as os.Lstat() would give us.
	var st syscall.Stat_t
	err := syscall.Lstat(path, &st)
	if err != nil {
		exc.Raise(&os.PathError{"lstat", path, err})
	}

	if st.Mode&syscall.S_IFMT == syscall.S_IFLNK {
		__, err := os.Readlink(path)
		blob_content = mem.Bytes(__)
		exc.Raiseif(err)
	} else {
		blob_content, err = ioutil.ReadFile(path)
		exc.Raiseif(err)
	}

	blob_sha1, err := WriteObject(g, blob_content, git.ObjectBlob)
	exc.Raiseif(err)

	return blob_sha1, st.Mode
Kirill Smelkov's avatar
Kirill Smelkov committed
156 157 158
}

// blob_sha1, mode -> file
159
func blob_to_file(g *git.Repository, blob_sha1 Sha1, mode uint32, path string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
	blob, err := ReadObject(g, blob_sha1, git.ObjectBlob)
	exc.Raiseif(err)
	blob_content := blob.Data()

	err = os.MkdirAll(pathpkg.Dir(path), 0777)
	exc.Raiseif(err)

	if mode&syscall.S_IFMT == syscall.S_IFLNK {
		err = os.Symlink(mem.String(blob_content), path)
		exc.Raiseif(err)
	} else {
		// NOTE mode is native - we cannot use ioutil.WriteFile() directly
		err = writefile(path, blob_content, mode)
		exc.Raiseif(err)
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
175 176 177 178 179 180 181 182 183 184 185 186 187 188
}

// -------- tags representation --------

// represent tag/tree/blob as specially crafted commit
//
// The reason we do this is that we want refs/tag/* to be parents of synthetic
// backup commit, but git does not allow tag objects to be in commit parents.
// Also besides commit and tag, it is possible for a ref to point to a tree or blob.
//
// We always attach original tagged object to crafted commit in one way or
// another, so that on backup restore we only have to recreate original tag
// object and tagged object is kept there in repo thanks to it being reachable
// through created commit.
189
func obj_represent_as_commit(ctx context.Context, g *git.Repository, sha1 Sha1, obj_type git.ObjectType) Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
	switch obj_type {
	case git.ObjectTag, git.ObjectTree, git.ObjectBlob:
		// ok
	default:
		exc.Raisef("%s (%s): cannot encode as commit", sha1, obj_type)
	}

	// first line in commit msg = object type
	obj_encoded := gittypestr(obj_type) + "\n"
	var tagged_type git.ObjectType
	var tagged_sha1 Sha1

	// below the code layout is mainly for tag type, and we hook tree and blob
	// types handling into that layout
	if obj_type == git.ObjectTag {
		tag, tag_obj := xload_tag(g, sha1)
		tagged_type = tag.tagged_type
		tagged_sha1 = tag.tagged_sha1
		obj_encoded += mem.String(tag_obj.Data())
	} else {
		// for tree/blob we only care that object stays reachable
		tagged_type = obj_type
		tagged_sha1 = sha1
	}

	// all commits we do here - we do with fixed name/date, so transformation
	// tag->commit is stable wrt git environment and time change
	fixed := AuthorInfo{Name: "Git backup", Email: "git@backup.org", When: time.Unix(0, 0).UTC()}
	zcommit_tree := func(tree Sha1, parents []Sha1, msg string) Sha1 {
		return xcommit_tree2(g, tree, parents, msg, fixed, fixed)
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> ø
	// Commit             .parent -> Commit
	if tagged_type == git.ObjectCommit {
227
		return zcommit_tree(mktree_empty(ctx), []Sha1{tagged_sha1}, obj_encoded)
Kirill Smelkov's avatar
Kirill Smelkov committed
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> Tree
	// Tree               .parent -> ø
	if tagged_type == git.ObjectTree {
		return zcommit_tree(tagged_sha1, []Sha1{}, obj_encoded)
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> Tree* "tagged" -> Blob
	// Blob               .parent -> ø
	if tagged_type == git.ObjectBlob {
243
		tree_for_blob := xgitSha1(ctx, "mktree", RunWith{stdin: fmt.Sprintf("100644 blob %s\ttagged\n", tagged_sha1)})
Kirill Smelkov's avatar
Kirill Smelkov committed
244 245 246 247 248 249 250 251
		return zcommit_tree(tree_for_blob, []Sha1{}, obj_encoded)
	}

	// Tag₂       ~>     Commit₂*
	//  |                 .msg:      Tag₂
	//  v                 .tree   -> ø
	// Tag₁               .parent -> Commit₁*
	if tagged_type == git.ObjectTag {
252 253
		commit1 := obj_represent_as_commit(ctx, g, tagged_sha1, tagged_type)
		return zcommit_tree(mktree_empty(ctx), []Sha1{commit1}, obj_encoded)
Kirill Smelkov's avatar
Kirill Smelkov committed
254 255 256 257
	}

	exc.Raisef("%s (%q): unknown tagged type", sha1, tagged_type)
	panic(0)
Kirill Smelkov's avatar
Kirill Smelkov committed
258 259 260 261
}

// recreate tag/tree/blob from specially crafted commit
// (see obj_represent_as_commit() about how a objects are originally translated into commit)
262 263 264 265
// returns:
//   - tag:       recreated object sha1
//   - tree/blob: null sha1
func obj_recreate_from_commit(g *git.Repository, commit_sha1 Sha1) Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312
	xraise := func(info interface{}) { exc.Raise(&RecreateObjError{commit_sha1, info}) }
	xraisef := func(f string, a ...interface{}) { xraise(fmt.Sprintf(f, a...)) }

	commit, err := g.LookupCommit(commit_sha1.AsOid())
	if err != nil {
		xraise(err)
	}
	if commit.ParentCount() > 1 {
		xraise(">1 parents")
	}

	obj_type, obj_raw, err := xstrings.HeadTail(commit.Message(), "\n")
	if err != nil {
		xraise("invalid encoded format")
	}
	switch obj_type {
	case "tag", "tree", "blob":
		// ok
	default:
		xraisef("unexpected encoded object type %q", obj_type)
	}

	// for tree/blob we do not need to do anything - that objects were reachable
	// from commit and are present in git db.
	if obj_type == "tree" || obj_type == "blob" {
		return Sha1{}
	}

	// re-create tag object
	tag_sha1, err := WriteObject(g, mem.Bytes(obj_raw), git.ObjectTag)
	exc.Raiseif(err)

	// the original tagged object should be already in repository, because we
	// always attach it to encoding commit one way or another,
	// except we need to recurse, if it was Tag₂->Tag₁
	tag, err := tag_parse(obj_raw)
	if err != nil {
		xraisef("encoded tag: %s", err)
	}
	if tag.tagged_type == git.ObjectTag {
		if commit.ParentCount() == 0 {
			xraise("encoded tag corrupt (tagged is tag but []parent is empty)")
		}
		obj_recreate_from_commit(g, Sha1FromOid(commit.ParentId(0)))
	}

	return tag_sha1
Kirill Smelkov's avatar
Kirill Smelkov committed
313 314 315
}

type RecreateObjError struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
316 317
	commit_sha1 Sha1
	info        interface{}
Kirill Smelkov's avatar
Kirill Smelkov committed
318 319 320
}

func (e *RecreateObjError) Error() string {
Kirill Smelkov's avatar
Kirill Smelkov committed
321
	return fmt.Sprintf("commit %s: %s", e.commit_sha1, e.info)
Kirill Smelkov's avatar
Kirill Smelkov committed
322 323 324 325 326
}

// -------- git-backup pull --------

// cmd_pull_usage prints usage of `git-backup pull` to stderr.
func cmd_pull_usage() {
	fmt.Fprint(os.Stderr,
`git-backup pull <dir1>:<prefix1> <dir2>:<prefix2> ...

Pull bare Git repositories & just files from dir1 into backup prefix1,
from dir2 into backup prefix2, etc...
`)
}

// PullSpec is one `<dir>:<prefix>` argument of `git-backup pull`.
type PullSpec struct {
	dir    string // directory to pull from
	prefix string // prefix in backup to pull into
}

339
func cmd_pull(ctx context.Context, gb *git.Repository, argv []string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
	flags := flag.FlagSet{Usage: cmd_pull_usage}
	flags.Init("", flag.ExitOnError)
	flags.Parse(argv)

	argv = flags.Args()
	if len(argv) < 1 {
		cmd_pull_usage()
		os.Exit(1)
	}

	pullspecv := []PullSpec{}
	for _, arg := range argv {
		dir, prefix, err := xstrings.Split2(arg, ":")
		if err != nil {
			fmt.Fprintf(os.Stderr, "E: invalid pullspec %q\n", arg)
			cmd_pull_usage()
			os.Exit(1)
		}

		pullspecv = append(pullspecv, PullSpec{dir, prefix})
	}

362
	cmd_pull_(ctx, gb, pullspecv)
Kirill Smelkov's avatar
Kirill Smelkov committed
363 364
}

365
// Ref is info about a reference pointing to sha1.
Kirill Smelkov's avatar
Kirill Smelkov committed
366
type Ref struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
367 368
	name string // reference name without "refs/" prefix
	sha1 Sha1
Kirill Smelkov's avatar
Kirill Smelkov committed
369 370
}

371
func cmd_pull_(ctx context.Context, gb *git.Repository, pullspecv []PullSpec) {
Kirill Smelkov's avatar
Kirill Smelkov committed
372 373 374 375 376
	// while pulling, we'll keep refs from all pulled repositories under temp
	// unique work refs namespace.
	backup_time := time.Now().Format("20060102-1504")               // %Y%m%d-%H%M
	backup_refs_work := fmt.Sprintf("refs/backup/%s/", backup_time) // refs/backup/20150820-2109/

377 378
	// prevent another `git-backup pull` from running simultaneously
	backup_lock := "refs/backup.locked"
379 380
	xgit(ctx, "update-ref", backup_lock, mktree_empty(ctx), Sha1{})
	defer xgit(context.Background(), "update-ref", "-d", backup_lock)
Kirill Smelkov's avatar
Kirill Smelkov committed
381 382 383 384

	// make sure there is root commit
	var HEAD Sha1
	var err error
385
	gerr, __, _ := ggit(ctx, "rev-parse", "--verify", "HEAD")
Kirill Smelkov's avatar
Kirill Smelkov committed
386 387 388
	if gerr != nil {
		infof("# creating root commit")
		// NOTE `git commit` does not work in bare repo - do commit by hand
389 390
		HEAD = xcommit_tree(gb, mktree_empty(ctx), []Sha1{}, "Initialize git-backup repository")
		xgit(ctx, "update-ref", "-m", "git-backup pull init", "HEAD", HEAD)
Kirill Smelkov's avatar
Kirill Smelkov committed
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
	} else {
		HEAD, err = Sha1Parse(__)
		exc.Raiseif(err)
	}

	// build index of "already-have" objects: all commits + tag/tree/blob that
	// were at heads of already pulled repositories.
	//
	// Build it once and use below to check ourselves whether a head from a pulled
	// repository needs to be actually fetched. If we don't, `git fetch-pack`
	// will do similar to "all commits" linear scan for every pulled repository,
	// which are many out there.
	alreadyHave := Sha1Set{}
	infof("# building \"already-have\" index")

	// already have: all commits
	//
	// As of lab.nexedi.com/20180612 there are ~ 1.7·10⁷ objects total in backup.
	// Of those there are ~ 1.9·10⁶ commit objects, i.e. ~10% of total.
	// Since 1 sha1 is 2·10¹ bytes, the space needed for keeping sha1 of all
	// commits is ~ 4·10⁷B = ~40MB. It is thus ok to keep this index in RAM for now.
412
	for _, __ := range xstrings.SplitLines(xgit(ctx, "rev-list", HEAD), "\n") {
Kirill Smelkov's avatar
Kirill Smelkov committed
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
		sha1, err := Sha1Parse(__)
		exc.Raiseif(err)
		alreadyHave.Add(sha1)
	}

	// already have: tag/tree/blob that were at heads of already pulled repositories
	//
	// As of lab.nexedi.com/20180612 there are ~ 8.4·10⁴ refs in total.
	// Of those encoded tag/tree/blob are ~ 3.2·10⁴, i.e. ~40% of total.
	// The number of tag/tree/blob objects in alreadyHave is thus negligible
	// compared to the number of "all commits".
	hcommit, err := gb.LookupCommit(HEAD.AsOid())
	exc.Raiseif(err)
	htree, err := hcommit.Tree()
	exc.Raiseif(err)
	if htree.EntryByName("backup.refs") != nil {
429
		repotab, err := loadBackupRefs(ctx, fmt.Sprintf("%s:backup.refs", HEAD))
Kirill Smelkov's avatar
Kirill Smelkov committed
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
		exc.Raiseif(err)

		for _, repo := range repotab {
			for _, xref := range repo.refs {
				if xref.sha1 != xref.sha1_ && !alreadyHave.Contains(xref.sha1) {
					// make sure encoded tag/tree/blob objects represented as
					// commits are present. We do so, because we promise to
					// fetch that all objects in alreadyHave are present.
					obj_recreate_from_commit(gb, xref.sha1_)

					alreadyHave.Add(xref.sha1)
				}
			}
		}
	}

	// walk over specified dirs, pulling objects from git and blobbing non-git-object files
	blobbedv := []string{} // info about file pulled to blob, and not yet added to index
	for _, __ := range pullspecv {
		dir, prefix := __.dir, __.prefix

		// make sure index is empty for prefix (so that we start from clean
		// prefix namespace and this way won't leave stale removed things)
453
		xgit(ctx, "rm", "--cached", "-r", "--ignore-unmatch", "--", prefix)
Kirill Smelkov's avatar
Kirill Smelkov committed
454 455 456 457 458 459 460 461 462 463 464 465 466

		here := my.FuncName()
		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) (errout error) {
			if err != nil {
				if os.IsNotExist(err) {
					// a file or directory was removed in parallel to us scanning the tree.
					infof("Warning: Skipping %s: %s", path, err)
					return nil
				}
				// any other error -> stop
				return err
			}

467 468 469 470
			if ctx.Err() != nil {
				return ctx.Err()
			}

Kirill Smelkov's avatar
Kirill Smelkov committed
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
			// propagate exceptions properly via filepath.Walk as errors with calling context
			// (filepath is not our code)
			defer exc.Catch(func(e *exc.Error) {
				errout = exc.Addcallingcontext(here, e)
			})

			// files -> blobs + queue info for adding blobs to index
			if !info.IsDir() {
				infof("# file %s\t<- %s", prefix, path)
				blob, mode := file_to_blob(gb, path)
				blobbedv = append(blobbedv,
					fmt.Sprintf("%o %s\t%s", mode, blob, reprefix(dir, prefix, path)))
				return nil
			}

			// directories -> look for *.git and handle git object specially.

			// do not recurse into *.git/objects/  - we'll save them specially
			if strings.HasSuffix(path, ".git/objects") {
				return filepath.SkipDir
			}

			// else we recurse, but handle *.git specially - via fetching objects from it
			if !strings.HasSuffix(path, ".git") {
				return nil
			}
497 498 499 500 501 502 503
			head, err := os.Stat(path + "/HEAD")
			if os.IsNotExist(err) || head.IsDir() {
				return nil // not a git repository
			}
			if err != nil {
				return err
			}
Kirill Smelkov's avatar
Kirill Smelkov committed
504 505 506

			// git repo - let's pull all refs from it to our backup refs namespace
			infof("# git  %s\t<- %s", prefix, path)
507
			refv, _, err := fetch(ctx, path, alreadyHave)
Kirill Smelkov's avatar
Kirill Smelkov committed
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
			exc.Raiseif(err)

			// TODO don't store to git references all references from fetched repository:
			//
			//      We need to store to git references only references that were actually
			//      fetched - so that next fetch, e.g. from a fork that also has new data
			//      as its upstream, won't have to transfer what we just have fetched
			//      from upstream.
			//
			//      For this purpose we can also save references by naming them as their
			//      sha1, not actual name, which will automatically deduplicate them in
			//      between several repositories, especially when/if pull will be made to
			//      work in parallel.
			//
			//      Such changed-only deduplicated references should be O(δ) - usually only
			//      a few, and this way we will also automatically avoid O(n^2) behaviour
			//      of every git fetch scanning all local references at its startup.
			//
			//      For backup.refs, we can generate it directly from refv of all fetched
			//      repositories saved in RAM.
			reporefprefix := backup_refs_work +
				// NOTE repo name is escaped as it can contain e.g. spaces, and refs must not
				path_refescape(reprefix(dir, prefix, path))
			for _, ref := range refv {
				err = mkref(gb, reporefprefix+"/"+ref.name, ref.sha1)
				exc.Raiseif(err)
			}

			// XXX do we want to do full fsck of source git repo on pull as well ?

			return nil
		})

		// re-raise / raise error after Walk
		if err != nil {
			e := exc.Aserror(err)
			e = exc.Addcontext(e, "pulling from "+dir)
			exc.Raise(e)
		}
	}

	// add to index files we converted to blobs
550
	xgit(ctx, "update-index", "--add", "--index-info", RunWith{stdin: strings.Join(blobbedv, "\n")})
Kirill Smelkov's avatar
Kirill Smelkov committed
551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567

	// all refs from all found git repositories populated.
	// now prepare manifest with ref -> sha1 and do a synthetic commit merging all that sha1
	// (so they become all reachable from HEAD -> survive repack and be transferable on git pull)
	//
	// NOTE we handle tag/tree/blob objects specially - because these objects cannot
	// be in commit parents, we convert them to specially-crafted commits and use them.
	// The commits prepared contain full info how to restore original objects.

	// backup.refs format:
	//
	//   1eeb0324 <prefix>/wendelin.core.git/heads/master
	//   213a9243 <prefix>/wendelin.core.git/tags/v0.4 <213a9243-converted-to-commit>
	//   ...
	//
	// NOTE `git for-each-ref` sorts output by ref
	//      -> backup_refs is sorted and stable between runs
568
	backup_refs_dump := xgit(ctx, "for-each-ref", backup_refs_work)
Kirill Smelkov's avatar
Kirill Smelkov committed
569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
	backup_refs_list := []Ref{}       // parsed dump
	backup_refsv := []string{}        // backup.refs content
	backup_refs_parents := Sha1Set{}  // sha1 for commit parents, obtained from refs
	noncommit_seen := map[Sha1]Sha1{} // {} sha1 -> sha1_ (there are many duplicate tags)
	for _, __ := range xstrings.SplitLines(backup_refs_dump, "\n") {
		sha1, type_, ref := Sha1{}, "", ""
		_, err := fmt.Sscanf(__, "%s %s %s\n", &sha1, &type_, &ref)
		if err != nil {
			exc.Raisef("%s: strange for-each-ref entry %q", backup_refs_work, __)
		}
		backup_refs_list = append(backup_refs_list, Ref{ref, sha1})
		backup_refs_entry := fmt.Sprintf("%s %s", sha1, strip_prefix(backup_refs_work, ref))

		// represent tag/tree/blob as specially crafted commit, because we
		// cannot use it as commit parent.
		sha1_ := sha1
		if type_ != "commit" {
			//infof("obj_as_commit %s  %s\t%s", sha1, type_, ref)  XXX
			var seen bool
			sha1_, seen = noncommit_seen[sha1]
			if !seen {
				obj_type, ok := gittype(type_)
				if !ok {
					exc.Raisef("%s: invalid git type in entry %q", backup_refs_work, __)
				}
594
				sha1_ = obj_represent_as_commit(ctx, gb, sha1, obj_type)
Kirill Smelkov's avatar
Kirill Smelkov committed
595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
				noncommit_seen[sha1] = sha1_
			}

			backup_refs_entry += fmt.Sprintf(" %s", sha1_)
		}

		backup_refsv = append(backup_refsv, backup_refs_entry)

		if !backup_refs_parents.Contains(sha1_) { // several refs can refer to the same sha1
			backup_refs_parents.Add(sha1_)
		}
	}

	backup_refs := strings.Join(backup_refsv, "\n")
	backup_refs_parentv := backup_refs_parents.Elements()
	sort.Sort(BySha1(backup_refs_parentv)) // so parents order is stable in between runs

	// backup_refs -> blob
613
	backup_refs_sha1 := xgitSha1(ctx, "hash-object", "-w", "--stdin", RunWith{stdin: backup_refs})
Kirill Smelkov's avatar
Kirill Smelkov committed
614 615

	// add backup_refs blob to index
616
	xgit(ctx, "update-index", "--add", "--cacheinfo", fmt.Sprintf("100644,%s,backup.refs", backup_refs_sha1))
Kirill Smelkov's avatar
Kirill Smelkov committed
617 618

	// index is ready - prepare tree and commit
619
	backup_tree_sha1 := xgitSha1(ctx, "write-tree")
Kirill Smelkov's avatar
Kirill Smelkov committed
620 621 622
	commit_sha1 := xcommit_tree(gb, backup_tree_sha1, append([]Sha1{HEAD}, backup_refs_parentv...),
		"Git-backup "+backup_time)

623
	xgit(ctx, "update-ref", "-m", "git-backup pull", "HEAD", commit_sha1, HEAD)
Kirill Smelkov's avatar
Kirill Smelkov committed
624 625 626 627 628 629 630

	// remove no-longer needed backup refs & verify they don't stay
	backup_refs_delete := ""
	for _, ref := range backup_refs_list {
		backup_refs_delete += fmt.Sprintf("delete %s %s\n", ref.name, ref.sha1)
	}

631 632
	xgit(ctx, "update-ref", "--stdin", RunWith{stdin: backup_refs_delete})
	__ = xgit(ctx, "for-each-ref", backup_refs_work)
Kirill Smelkov's avatar
Kirill Smelkov committed
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651
	if __ != "" {
		exc.Raisef("Backup refs under %s not deleted properly", backup_refs_work)
	}

	// NOTE  `delete` deletes only files, but leaves empty dirs around.
	//       more important: this affect performance of future `git-backup pull` run a *LOT*
	//
	//       reason is: `git pull` first check local refs, and for doing so it
	//       recourse into all directories, even empty ones.
	//
	//       https://lab.nexedi.com/lab.nexedi.com/lab.nexedi.com/issues/4
	//
	//       So remove all dirs under backup_refs_work prefix in the end.
	//
	// TODO  Revisit this when reworking fetch to be parallel. Reason is: in
	//       the process of pulling repositories, the more references we
	//       accumulate, the longer pull starts to be, so it becomes O(n^2).
	//
	//       -> what to do is described nearby fetch/mkref call.
652
	gitdir := xgit(ctx, "rev-parse", "--git-dir")
Kirill Smelkov's avatar
Kirill Smelkov committed
653 654 655 656
	err = os.RemoveAll(gitdir + "/" + backup_refs_work)
	exc.Raiseif(err) // NOTE err is nil if path does not exist

	// if we have working copy - update it
657
	bare := xgit(ctx, "rev-parse", "--is-bare-repository")
Kirill Smelkov's avatar
Kirill Smelkov committed
658 659 660 661 662 663
	if bare != "true" {
		// `git checkout-index -af`  -- does not delete deleted files
		// `git read-tree -v -u --reset HEAD~ HEAD`  -- needs index matching
		// original worktree to properly work, but we already have updated index
		//
		// so we get changes we committed as diff and apply to worktree
664
		diff := xgit(ctx, "diff", "--binary", HEAD, "HEAD", RunWith{raw: true})
Kirill Smelkov's avatar
Kirill Smelkov committed
665
		if diff != "" {
666
			diffstat := xgit(ctx, "apply", "--stat", "--apply", "--binary", "--whitespace=nowarn",
Kirill Smelkov's avatar
Kirill Smelkov committed
667 668 669 670
				RunWith{stdin: diff, raw: true})
			infof("%s", diffstat)
		}
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
671 672
}

673 674 675 676 677 678
// fetch makes sure all objects from a repository are present in backup place.
//
// It fetches objects that are potentially missing in backup from the
// repository in question. The objects considered to fetch are those, that are
// reachable from all repository references.
//
679 680 681 682 683 684 685 686 687 688 689 690 691
// AlreadyHave can be given to indicate knowledge on what objects our repository
// already has. If remote advertises tip with sha1 in alreadyHave, that tip won't be
// fetched. Notice: alreadyHave is consulted directly - no reachability scan is
// performed on it.
//
// All objects reachable from alreadyHave must be in our repository.
// AlreadyHave does not need to be complete - if we have something that is not
// in alreadyHave - it can affect only speed, not correctness.
//
// Returned are 2 lists of references from the source repository:
//
//  - list of all references, and
//  - list of references we actually had to fetch.
692 693 694
//
// Note: fetch does not create any local references - the references returned
// only describe state of references in fetched source repository.
695
func fetch(ctx context.Context, repo string, alreadyHave Sha1Set) (refv, fetchedv []Ref, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
696 697 698
	defer xerr.Contextf(&err, "fetch %s", repo)

	// first check which references are advertised
699
	refv, err = lsremote(ctx, repo)
Kirill Smelkov's avatar
Kirill Smelkov committed
700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
	if err != nil {
		return nil, nil, err
	}

	// check if we already have something
	var fetchv []Ref // references we need to actually fetch.
	for _, ref := range refv {
		if !alreadyHave.Contains(ref.sha1) {
			fetchv = append(fetchv, ref)
		}
	}

	// if there is nothing to fetch - we are done
	if len(fetchv) == 0 {
		return refv, fetchv, nil
	}

	// fetch by sha1 what we don't already have from advertised.
	//
	// even if refs would change after ls-remote but before here, we should be
	// getting exactly what was advertised.
	//
	// related link on the subject:
	// https://git.kernel.org/pub/scm/git/git.git/commit/?h=051e4005a3
	var argv []interface{}
	arg := func(v ...interface{}) { argv = append(argv, v...) }
	arg(
		// check objects for corruption as they are fetched
		"-c", "fetch.fsckObjects=true",
		"fetch-pack", "--thin",

		// force upload-pack to allow us asking any sha1 we want.
		// needed because advertised refs we got at lsremote time could have changed.
		"--upload-pack=git -c uploadpack.allowAnySHA1InWant=true"+
			// workarounds for git < 2.11.1, which does not have uploadpack.allowAnySHA1InWant:
			" -c uploadpack.allowTipSHA1InWant=true -c uploadpack.allowReachableSHA1InWant=true"+
			//
			" upload-pack",

		repo)
	for _, ref := range fetchv {
		arg(ref.sha1)
	}
	arg(RunWith{stderr: gitprogress()})

745
	gerr, _, _ := ggit(ctx, argv...)
Kirill Smelkov's avatar
Kirill Smelkov committed
746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
	if gerr != nil {
		return nil, nil, gerr
	}

	// fetch-pack ran ok - now check that all fetched tips are indeed fully
	// connected and that we also have all referenced blob/tree objects. The
	// reason for this check is that source repository could send us a pack with
	// e.g. some objects missing and this way even if fetch-pack would report
	// success, chances could be we won't have all the objects we think we
	// fetched.
	//
	// when checking we assume that the roots we already have at all our
	// references are ok.
	//
	// related link on the subject:
	// https://git.kernel.org/pub/scm/git/git.git/commit/?h=6d4bb3833c
	argv = nil
	arg("rev-list", "--quiet", "--objects", "--not", "--all", "--not")
	for _, ref := range fetchv {
		arg(ref.sha1)
	}
	arg(RunWith{stderr: gitprogress()})

769
	gerr, _, _ = ggit(ctx, argv...)
Kirill Smelkov's avatar
Kirill Smelkov committed
770 771 772 773 774 775
	if gerr != nil {
		return nil, nil, fmt.Errorf("remote did not send all neccessary objects")
	}

	// fetched ok
	return refv, fetchv, nil
776 777 778
}

// lsremote lists all references advertised by repo.
779
func lsremote(ctx context.Context, repo string) (refv []Ref, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
780 781 782 783 784 785 786 787 788 789 790 791
	defer xerr.Contextf(&err, "lsremote %s", repo)

	// NOTE --refs instructs to omit peeled refs like
	//
	//   c668db59ccc59e97ce81f769d9f4633e27ad3bdb refs/tags/v0.1
	//   4b6821f4a4e4c9648941120ccbab03982e33104f refs/tags/v0.1^{}  <--
	//
	// because fetch-pack errors on them:
	//
	//   https://public-inbox.org/git/20180610143231.7131-1-kirr@nexedi.com/
	//
	// we don't need to pull them anyway.
792
	gerr, stdout, _ := ggit(ctx, "ls-remote", "--refs", repo)
Kirill Smelkov's avatar
Kirill Smelkov committed
793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
	if gerr != nil {
		return nil, gerr
	}

	//  oid refname
	//  oid refname
	//  ...
	for _, entry := range xstrings.SplitLines(stdout, "\n") {
		sha1, ref := Sha1{}, ""
		_, err := fmt.Sscanf(entry, "%s %s\n", &sha1, &ref)
		if err != nil {
			return nil, fmt.Errorf("strange output entry: %q", entry)
		}

		// Ref says its name goes without "refs/" prefix.
		if !strings.HasPrefix(ref, "refs/") {
			return nil, fmt.Errorf("non-refs/ reference: %q", ref)
		}
		ref = strings.TrimPrefix(ref, "refs/")

		refv = append(refv, Ref{ref, sha1})
	}

	return refv, nil
817 818
}

Kirill Smelkov's avatar
Kirill Smelkov committed
819 820 821
// -------- git-backup restore --------

// cmd_restore_usage prints help for `git-backup restore` to stderr.
func cmd_restore_usage() {
	const text = `git-backup restore <commit-ish> <prefix1>:<dir1> <prefix2>:<dir2> ...

Restore Git repositories & just files from backup prefix1 into dir1,
from backup prefix2 into dir2, etc...

Backup state to restore is taken from <commit-ish>.
`
	fmt.Fprint(os.Stderr, text)
}

// RestoreSpec is one <prefix>:<dir> argument of `git-backup restore`.
type RestoreSpec struct {
	prefix string // backup prefix to restore from
	dir    string // directory to restore into
}

836
func cmd_restore(ctx context.Context, gb *git.Repository, argv []string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
	flags := flag.FlagSet{Usage: cmd_restore_usage}
	flags.Init("", flag.ExitOnError)
	flags.Parse(argv)

	argv = flags.Args()
	if len(argv) < 2 {
		cmd_restore_usage()
		os.Exit(1)
	}

	HEAD := argv[0]

	restorespecv := []RestoreSpec{}
	for _, arg := range argv[1:] {
		prefix, dir, err := xstrings.Split2(arg, ":")
		if err != nil {
			fmt.Fprintf(os.Stderr, "E: invalid restorespec %q\n", arg)
			cmd_restore_usage()
			os.Exit(1)
		}

		restorespecv = append(restorespecv, RestoreSpec{prefix, dir})
	}

861
	cmd_restore_(ctx, gb, HEAD, restorespecv)
Kirill Smelkov's avatar
Kirill Smelkov committed
862 863 864 865 866
}

// kirr/wendelin.core.git/heads/master -> kirr/wendelin.core.git, heads/master
// tiwariayush/Discussion%20Forum%20.git/... -> tiwariayush/Discussion Forum .git, ...
func reporef_split(reporef string) (repo, ref string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
867 868 869 870 871 872 873 874
	dotgit := strings.Index(reporef, ".git/")
	if dotgit == -1 {
		exc.Raisef("E: %s is not a ref for a git repo", reporef)
	}
	repo, ref = reporef[:dotgit+4], reporef[dotgit+4+1:]
	repo, err := path_refunescape(repo) // unescape repo name we originally escaped when making backup
	exc.Raiseif(err)
	return repo, ref
Kirill Smelkov's avatar
Kirill Smelkov committed
875 876 877 878
}

// sha1 value(s) for a ref in 'backup.refs'
type BackupRefSha1 struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
879 880 881
	sha1  Sha1 // original sha1 this ref was pointing to in original repo
	sha1_ Sha1 // sha1 actually used to represent sha1's object in backup repo
	           // (for tag/tree/blob - they are converted to commits)
Kirill Smelkov's avatar
Kirill Smelkov committed
882 883
}

884
// BackupRef represents 1 reference entry in 'backup.refs'   (repo prefix stripped)
Kirill Smelkov's avatar
Kirill Smelkov committed
885
type BackupRef struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
886 887
	name string // reference name without "refs/" prefix
	BackupRefSha1
Kirill Smelkov's avatar
Kirill Smelkov committed
888 889 890 891 892 893 894
}

// RefMap maps a reference name to the sha1 values recorded for it:
//
//	{} refname -> sha1, sha1_
type RefMap map[string]BackupRefSha1

// info about a repository from backup.refs
type BackupRepo struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
895 896
	repopath string // full repo path with backup prefix
	refs     RefMap
Kirill Smelkov's avatar
Kirill Smelkov committed
897 898 899 900
}

// all RefMap values as flat []BackupRef
func (m RefMap) Values() []BackupRef {
Kirill Smelkov's avatar
Kirill Smelkov committed
901 902 903 904 905
	ev := make([]BackupRef, 0, len(m))
	for ref, refsha1 := range m {
		ev = append(ev, BackupRef{ref, refsha1})
	}
	return ev
Kirill Smelkov's avatar
Kirill Smelkov committed
906 907 908 909 910 911 912
}

// for sorting []BackupRef by refname
type ByRefname []BackupRef

func (br ByRefname) Len() int           { return len(br) }
func (br ByRefname) Swap(i, j int)      { br[i], br[j] = br[j], br[i] }
913
func (br ByRefname) Less(i, j int) bool { return strings.Compare(br[i].name, br[j].name) < 0 }
Kirill Smelkov's avatar
Kirill Smelkov committed
914 915 916

// all sha1 heads RefMap points to, in sorted order
func (m RefMap) Sha1Heads() []Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
917 918 919 920 921 922 923
	hs := Sha1Set{}
	for _, refsha1 := range m {
		hs.Add(refsha1.sha1)
	}
	headv := hs.Elements()
	sort.Sort(BySha1(headv))
	return headv
Kirill Smelkov's avatar
Kirill Smelkov committed
924 925 926 927
}

// like Sha1Heads() but returns heads in text format delimited by "\n"
func (m RefMap) Sha1HeadsStr() string {
Kirill Smelkov's avatar
Kirill Smelkov committed
928 929 930 931 932
	s := ""
	for _, sha1 := range m.Sha1Heads() {
		s += sha1.String() + "\n"
	}
	return s
Kirill Smelkov's avatar
Kirill Smelkov committed
933 934 935 936 937 938 939 940 941 942 943
}

// ByRepoPath implements sort.Interface to order []*BackupRepo by repopath.
type ByRepoPath []*BackupRepo

// Len returns the number of repositories.
func (br ByRepoPath) Len() int {
	return len(br)
}

// Swap exchanges repositories i and j.
func (br ByRepoPath) Swap(i, j int) {
	br[j], br[i] = br[i], br[j]
}

// Less reports whether the repopath of repository i sorts before that of j.
func (br ByRepoPath) Less(i, j int) bool {
	return br[i].repopath < br[j].repopath
}

// also for searching sorted []BackupRepo by repopath prefix
func (br ByRepoPath) Search(prefix string) int {
Kirill Smelkov's avatar
Kirill Smelkov committed
944 945 946
	return sort.Search(len(br), func(i int) bool {
		return strings.Compare(br[i].repopath, prefix) >= 0
	})
Kirill Smelkov's avatar
Kirill Smelkov committed
947 948
}

949 950
// request to extract a pack
type PackExtractReq struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
951 952
	refs     RefMap // extract pack with objects from this heads
	repopath string // into repository located here
953

Kirill Smelkov's avatar
Kirill Smelkov committed
954 955
	// for info only: request was generated restoring from under this backup prefix
	prefix string
956 957
}

958 959
func cmd_restore_(ctx context.Context, gb *git.Repository, HEAD_ string, restorespecv []RestoreSpec) {
	HEAD := xgitSha1(ctx, "rev-parse", "--verify", HEAD_)
Kirill Smelkov's avatar
Kirill Smelkov committed
960 961

	// read backup refs index
962
	repotab, err := loadBackupRefs(ctx, fmt.Sprintf("%s:backup.refs", HEAD))
Kirill Smelkov's avatar
Kirill Smelkov committed
963 964 965 966 967 968 969 970 971 972 973 974 975 976
	exc.Raiseif(err)

	// flattened & sorted repotab
	// NOTE sorted - to process repos always in the same order & for searching
	repov := make([]*BackupRepo, 0, len(repotab))
	for _, repo := range repotab {
		repov = append(repov, repo)
	}
	sort.Sort(ByRepoPath(repov))

	// repotab no longer needed
	repotab = nil

	packxq := make(chan PackExtractReq, 2*njobs) // requests to extract packs
977
	wg := xsync.NewWorkGroup(ctx)
Kirill Smelkov's avatar
Kirill Smelkov committed
978 979 980

	// main worker: walk over specified prefixes restoring files and
	// scheduling pack extraction requests from *.git -> packxq
981
	wg.Go(func(ctx context.Context) (err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
982
		defer close(packxq)
983
		// raised err -> return
Kirill Smelkov's avatar
Kirill Smelkov committed
984 985
		here := my.FuncName()
		defer exc.Catch(func(e *exc.Error) {
986
			err = exc.Addcallingcontext(here, e)
Kirill Smelkov's avatar
Kirill Smelkov committed
987 988 989 990 991 992 993 994 995 996
		})

		for _, __ := range restorespecv {
			prefix, dir := __.prefix, __.dir

			// ensure dir did not exist before restore run
			err := os.Mkdir(dir, 0777)
			exc.Raiseif(err)

			// files
997
			lstree := xgit(ctx, "ls-tree", "--full-tree", "-r", "-z", "--", HEAD, prefix, RunWith{raw: true})
Kirill Smelkov's avatar
Kirill Smelkov committed
998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
			repos_seen := StrSet{} // dirs of *.git seen while restoring files
			for _, __ := range xstrings.SplitLines(lstree, "\x00") {
				mode, type_, sha1, filename, err := parse_lstree_entry(__)
				// NOTE
				//  - `ls-tree -r` shows only leaf objects
				//  - git-backup repository does not have submodules and the like
				// -> type should be "blob" only
				if err != nil || type_ != "blob" {
					exc.Raisef("%s: invalid/unexpected ls-tree entry %q", HEAD, __)
				}

1009 1010
				exc.Raiseif(ctx.Err())

Kirill Smelkov's avatar
Kirill Smelkov committed
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
				filename = reprefix(prefix, dir, filename)
				infof("# file %s\t-> %s", prefix, filename)
				blob_to_file(gb, sha1, mode, filename)

				// make sure git will recognize *.git as repo:
				//   - it should have refs/{heads,tags}/ and objects/pack/ inside.
				//
				// NOTE doing it while restoring files, because a repo could be
				//   empty - without refs at all, and thus next "git packs restore"
				//   step will not be run for it.
				filedir := pathpkg.Dir(filename)
1022
				if strings.HasSuffix(filename, ".git/HEAD") && !repos_seen.Contains(filedir) {
Kirill Smelkov's avatar
Kirill Smelkov committed
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053
					infof("# repo %s\t-> %s", prefix, filedir)
					for _, __ := range []string{"refs/heads", "refs/tags", "objects/pack"} {
						err := os.MkdirAll(filedir+"/"+__, 0777)
						exc.Raiseif(err)
					}
					repos_seen.Add(filedir)
				}
			}

			// git packs
			for i := ByRepoPath(repov).Search(prefix); i < len(repov); i++ {
				repo := repov[i]
				if !strings.HasPrefix(repo.repopath, prefix) {
					break // repov is sorted - end of repositories with prefix
				}

				// make sure tag/tree/blob objects represented as commits are
				// present, before we generate pack for restored repo.
				// ( such objects could be lost e.g. after backup repo repack as they
				//   are not reachable from backup repo HEAD )
				for _, __ := range repo.refs {
					if __.sha1 != __.sha1_ {
						obj_recreate_from_commit(gb, __.sha1_)
					}
				}

				select {
				case packxq <- PackExtractReq{refs: repo.refs,
					repopath: reprefix(prefix, dir, repo.repopath),
					prefix:   prefix}:

1054 1055
				case <-ctx.Done():
					return ctx.Err()
Kirill Smelkov's avatar
Kirill Smelkov committed
1056 1057 1058
				}
			}
		}
1059 1060 1061

		return nil
	})
Kirill Smelkov's avatar
Kirill Smelkov committed
1062 1063 1064

	// pack workers: packxq -> extract packs
	for i := 0; i < njobs; i++ {
1065 1066
		wg.Go(func(ctx context.Context) (err error) {
			// raised err -> return
Kirill Smelkov's avatar
Kirill Smelkov committed
1067 1068
			here := my.FuncName()
			defer exc.Catch(func(e *exc.Error) {
1069
				err = exc.Addcallingcontext(here, e)
Kirill Smelkov's avatar
Kirill Smelkov committed
1070 1071 1072 1073
			})

			for {
				select {
1074 1075
				case <-ctx.Done():
					return ctx.Err()
Kirill Smelkov's avatar
Kirill Smelkov committed
1076 1077 1078

				case p, ok := <-packxq:
					if !ok {
1079
						return nil
Kirill Smelkov's avatar
Kirill Smelkov committed
1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098
					}
					infof("# git  %s\t-> %s", p.prefix, p.repopath)

					// extract pack for that repo from big backup pack + decoded tags
					pack_argv := []string{
						"-c", "pack.threads=1", // occupy only 1 CPU + it packs better
						"pack-objects",
						"--revs", // include all objects referencable from input sha1 list
						"--reuse-object", "--reuse-delta", "--delta-base-offset",

						// use bitmap index from backup repo, if present (faster pack generation)
						// https://git.kernel.org/pub/scm/git/git.git/commit/?h=645c432d61
						"--use-bitmap-index",
					}
					if verbose <= 0 {
						pack_argv = append(pack_argv, "-q")
					}
					pack_argv = append(pack_argv, p.repopath+"/objects/pack/pack")

1099
					xgit2(ctx, pack_argv, RunWith{stdin: p.refs.Sha1HeadsStr(), stderr: gitprogress()})
Kirill Smelkov's avatar
Kirill Smelkov committed
1100 1101

					// verify that extracted repo refs match backup.refs index after extraction
1102
					x_ref_list := xgit(ctx, "--git-dir="+p.repopath,
Kirill Smelkov's avatar
Kirill Smelkov committed
1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
						"for-each-ref", "--format=%(objectname) %(refname)")
					repo_refs := p.refs.Values()
					sort.Sort(ByRefname(repo_refs))
					repo_ref_listv := make([]string, 0, len(repo_refs))
					for _, ref := range repo_refs {
						repo_ref_listv = append(repo_ref_listv, fmt.Sprintf("%s refs/%s", ref.sha1, ref.name))
					}
					repo_ref_list := strings.Join(repo_ref_listv, "\n")
					if x_ref_list != repo_ref_list {
						// TODO show refs diff, not 2 dumps
						exc.Raisef("E: extracted %s refs corrupt:\n\nwant:\n%s\n\nhave:\n%s",
							p.repopath, repo_ref_list, x_ref_list)
					}

					// check connectivity in recreated repository.
					//
					// This way we verify that extracted pack indeed contains all
					// objects for all refs in the repo.
					//
					// Compared to fsck we do not re-compute sha1 sum of objects which
					// is significantly faster.
1124
					gerr, _, _ := ggit(ctx, "--git-dir="+p.repopath,
Kirill Smelkov's avatar
Kirill Smelkov committed
1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140
						"rev-list", "--objects", "--stdin", "--quiet", RunWith{stdin: p.refs.Sha1HeadsStr()})
					if gerr != nil {
						fmt.Fprintln(os.Stderr, "E: Problem while checking connectivity of extracted repo:")
						exc.Raise(gerr)
					}

					// XXX disabled because it is slow
					// // NOTE progress goes to stderr, problems go to stdout
					// xgit("--git-dir=" + p.repopath, "fsck",
					//         # only check that traversal from refs is ok: this unpacks
					//         # commits and trees and verifies blob objects are there,
					//         # but do _not_ unpack blobs =fast.
					//         "--connectivity-only",
					//         RunWith{stdout: gitprogress(), stderr: gitprogress()})
				}
			}
1141
		})
Kirill Smelkov's avatar
Kirill Smelkov committed
1142 1143
	}

1144 1145 1146
	// wait for workers to finish & collect/reraise first error, if any
	err = wg.Wait()
	exc.Raiseif(err)
Kirill Smelkov's avatar
Kirill Smelkov committed
1147 1148
}

1149 1150 1151
// loadBackupRefs loads 'backup.ref' content from a git object.
//
// an example of object is e.g. "HEAD:backup.ref".
1152
func loadBackupRefs(ctx context.Context, object string) (repotab map[string]*BackupRepo, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
1153 1154
	defer xerr.Contextf(&err, "load backup.refs %q", object)

1155
	gerr, backup_refs, _ := ggit(ctx, "cat-file", "blob", object)
Kirill Smelkov's avatar
Kirill Smelkov committed
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
	if gerr != nil {
		return nil, gerr
	}

	repotab = make(map[string]*BackupRepo)
	for _, refentry := range xstrings.SplitLines(backup_refs, "\n") {
		// sha1 prefix+refname (sha1_)
		badentry := func() error { return fmt.Errorf("invalid entry: %q", refentry) }
		refentryv := strings.Fields(refentry)
		if !(2 <= len(refentryv) && len(refentryv) <= 3) {
			return nil, badentry()
		}
		sha1, err := Sha1Parse(refentryv[0])
		sha1_, err_ := sha1, err
		if len(refentryv) == 3 {
			sha1_, err_ = Sha1Parse(refentryv[2])
		}
		if err != nil || err_ != nil {
			return nil, badentry()
		}
		reporef := refentryv[1]
		repopath, ref := reporef_split(reporef)

		repo := repotab[repopath]
		if repo == nil {
			repo = &BackupRepo{repopath, RefMap{}}
			repotab[repopath] = repo
		}

		if _, alreadyin := repo.refs[ref]; alreadyin {
			return nil, fmt.Errorf("duplicate ref %q", ref)
		}
		repo.refs[ref] = BackupRefSha1{sha1, sha1_}
	}

	return repotab, nil
1192 1193
}

1194
var commands = map[string]func(context.Context, *git.Repository, []string){
Kirill Smelkov's avatar
Kirill Smelkov committed
1195 1196
	"pull":    cmd_pull,
	"restore": cmd_restore,
Kirill Smelkov's avatar
Kirill Smelkov committed
1197 1198 1199
}

func usage() {
Kirill Smelkov's avatar
Kirill Smelkov committed
1200
	fmt.Fprintf(os.Stderr,
Kirill Smelkov's avatar
Kirill Smelkov committed
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
`git-backup [options] <command>

    pull        pull git-repositories and files to backup
    restore     restore git-repositories and files from backup

  common options:

    -h --help       this help text.
    -v              increase verbosity.
    -q              decrease verbosity.
1211 1212
    -j N            allow max N jobs to spawn; default=NPROC (%d on this system)
`, njobs)
Kirill Smelkov's avatar
Kirill Smelkov committed
1213 1214 1215
}

func main() {
Kirill Smelkov's avatar
Kirill Smelkov committed
1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250
	flag.Usage = usage
	quiet := 0
	flag.Var((*xflag.Count)(&verbose), "v", "verbosity level")
	flag.Var((*xflag.Count)(&quiet), "q", "decrease verbosity")
	flag.IntVar(&njobs, "j", njobs, "allow max N jobs to spawn")
	flag.Parse()
	verbose -= quiet
	argv := flag.Args()

	if len(argv) == 0 {
		usage()
		os.Exit(1)
	}

	cmd := commands[argv[0]]
	if cmd == nil {
		fmt.Fprintf(os.Stderr, "E: unknown command %q", argv[0])
		os.Exit(1)
	}

	// catch Error and report info from it
	here := my.FuncName()
	defer exc.Catch(func(e *exc.Error) {
		e = exc.Addcallingcontext(here, e)
		fmt.Fprintln(os.Stderr, e)

		// also show traceback if debug
		if verbose > 2 {
			fmt.Fprint(os.Stderr, "\n")
			debug.PrintStack()
		}

		os.Exit(1)
	})

1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263
	// cancel what we'll do on SIGINT | SIGTERM
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	sigq := make(chan os.Signal, 1)
	signal.Notify(sigq, os.Interrupt, syscall.SIGTERM)
	go func() {
		select {
		case <-ctx.Done():
		case <-sigq:
			cancel()
		}
	}()

Kirill Smelkov's avatar
Kirill Smelkov committed
1264 1265 1266 1267
	// backup repository
	gb, err := git.OpenRepository(".")
	exc.Raiseif(err)

1268
	cmd(ctx, gb, argv[1:])
Kirill Smelkov's avatar
Kirill Smelkov committed
1269
}