// Copyright (C) 2015-2020  Nexedi SA and Contributors.
//                          Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.

/*
Git-backup - Backup set of Git repositories & just files; efficiently.

This program backs up files and a set of bare Git repositories into one Git
repository. Files are copied to blobs and then added to the tree under a
certain place, and for Git repositories, all reachable objects are pulled in
while maintaining an index which remembers reference -> sha1 for every pulled
repository.

After objects from backuped Git repositories are pulled in, we create new
commit which references tree with changed backup index and files, and also has
all head objects from pulled-in repositories in its parents(*). This way backup
has history and all pulled objects become reachable from single head commit in
backup repository. In particular that means that the whole state of backup can
be described with only single sha1, and that backup repository itself could be
synchronized via standard git pull/push, be repacked, etc.

Restoration process is the opposite - from a particular backup state, files are
extracted at a proper place, and for Git repositories a pack with all objects
reachable from that repository heads is prepared and extracted from backup
repository object database.

This approach allows to leverage Git's good ability for object contents
deduplication and packing, especially for cases when there are many hosted
repositories which are forks of each other with relatively minor changes in
between each other and over time, and mostly common base. In author experience
the size of backup is dramatically smaller compared to straightforward "let's
tar it all" approach.

Data for all backuped files and repositories can be accessed if one has access
to backup repository, so either they all should be in the same security domain,
or extra care has to be taken to protect access to backup repository.

File permissions are not managed with strict details due to inherent
nature of Git. This aspect can be improved with e.g. etckeeper-like
(http://etckeeper.branchable.com/) approach if needed.

Please see README.rst with user-level overview on how to use git-backup.

NOTE the idea of pulling all refs together is similar to git-namespaces
     http://git-scm.com/docs/gitnamespaces

(*) Tag objects are handled specially - because in a lot of places Git insists and
    assumes commit parents can only be commit objects. We encode tag objects in
    specially-crafted commit object on pull, and decode back on backup restore.

    We do likewise if a ref points to tree or blob, which is valid in Git.
*/
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	pathpkg "path"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"sort"
	"strings"
	"sync"
	"syscall"
	"time"

	"lab.nexedi.com/kirr/go123/exc"
	"lab.nexedi.com/kirr/go123/mem"
	"lab.nexedi.com/kirr/go123/my"
	"lab.nexedi.com/kirr/go123/xerr"
	"lab.nexedi.com/kirr/go123/xflag"
	"lab.nexedi.com/kirr/go123/xstrings"

	git "github.com/libgit2/git2go"
)

// verbose is the output verbosity level:
//
//	0 - silent
//	1 - info
//	2 - progress of long-running operations
//	3 - debug
var verbose = 1

func infof(format string, a ...interface{}) {
Kirill Smelkov's avatar
Kirill Smelkov committed
102 103 104 105
	if verbose > 0 {
		fmt.Printf(format, a...)
		fmt.Println()
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
106 107 108 109 110
}

// what to pass to git subprocess to stdout/stderr
// DontRedirect - no-redirection, PIPE - output to us
func gitprogress() StdioRedirect {
Kirill Smelkov's avatar
Kirill Smelkov committed
111 112 113 114
	if verbose > 1 {
		return DontRedirect
	}
	return PIPE
Kirill Smelkov's avatar
Kirill Smelkov committed
115 116 117
}

func debugf(format string, a ...interface{}) {
Kirill Smelkov's avatar
Kirill Smelkov committed
118 119 120 121
	if verbose > 2 {
		fmt.Printf(format, a...)
		fmt.Println()
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
122 123
}

// njobs is how many jobs to spawn at most.
// It is initialized to the number of CPUs reported by the Go runtime.
var njobs = runtime.NumCPU()

127
// -------- create/extract blob --------
Kirill Smelkov's avatar
Kirill Smelkov committed
128 129

// file -> blob_sha1, mode
130
func file_to_blob(g *git.Repository, path string) (Sha1, uint32) {
Kirill Smelkov's avatar
Kirill Smelkov committed
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
	var blob_content []byte

	// because we want to pass mode to outside world (to e.g. `git update-index`)
	// we need to get native OS mode, not translated one as os.Lstat() would give us.
	var st syscall.Stat_t
	err := syscall.Lstat(path, &st)
	if err != nil {
		exc.Raise(&os.PathError{"lstat", path, err})
	}

	if st.Mode&syscall.S_IFMT == syscall.S_IFLNK {
		__, err := os.Readlink(path)
		blob_content = mem.Bytes(__)
		exc.Raiseif(err)
	} else {
		blob_content, err = ioutil.ReadFile(path)
		exc.Raiseif(err)
	}

	blob_sha1, err := WriteObject(g, blob_content, git.ObjectBlob)
	exc.Raiseif(err)

	return blob_sha1, st.Mode
Kirill Smelkov's avatar
Kirill Smelkov committed
154 155 156
}

// blob_sha1, mode -> file
157
func blob_to_file(g *git.Repository, blob_sha1 Sha1, mode uint32, path string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
	blob, err := ReadObject(g, blob_sha1, git.ObjectBlob)
	exc.Raiseif(err)
	blob_content := blob.Data()

	err = os.MkdirAll(pathpkg.Dir(path), 0777)
	exc.Raiseif(err)

	if mode&syscall.S_IFMT == syscall.S_IFLNK {
		err = os.Symlink(mem.String(blob_content), path)
		exc.Raiseif(err)
	} else {
		// NOTE mode is native - we cannot use ioutil.WriteFile() directly
		err = writefile(path, blob_content, mode)
		exc.Raiseif(err)
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
173 174 175 176 177 178 179 180 181 182 183 184 185 186
}

// -------- tags representation --------

// represent tag/tree/blob as specially crafted commit
//
// The reason we do this is that we want refs/tag/* to be parents of synthetic
// backup commit, but git does not allow tag objects to be in commit parents.
// Also besides commit and tag, it is possible for a ref to point to a tree or blob.
//
// We always attach original tagged object to crafted commit in one way or
// another, so that on backup restore we only have to recreate original tag
// object and tagged object is kept there in repo thanks to it being reachable
// through created commit.
187
func obj_represent_as_commit(g *git.Repository, sha1 Sha1, obj_type git.ObjectType) Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
	switch obj_type {
	case git.ObjectTag, git.ObjectTree, git.ObjectBlob:
		// ok
	default:
		exc.Raisef("%s (%s): cannot encode as commit", sha1, obj_type)
	}

	// first line in commit msg = object type
	obj_encoded := gittypestr(obj_type) + "\n"
	var tagged_type git.ObjectType
	var tagged_sha1 Sha1

	// below the code layout is mainly for tag type, and we hook tree and blob
	// types handling into that layout
	if obj_type == git.ObjectTag {
		tag, tag_obj := xload_tag(g, sha1)
		tagged_type = tag.tagged_type
		tagged_sha1 = tag.tagged_sha1
		obj_encoded += mem.String(tag_obj.Data())
	} else {
		// for tree/blob we only care that object stays reachable
		tagged_type = obj_type
		tagged_sha1 = sha1
	}

	// all commits we do here - we do with fixed name/date, so transformation
	// tag->commit is stable wrt git environment and time change
	fixed := AuthorInfo{Name: "Git backup", Email: "git@backup.org", When: time.Unix(0, 0).UTC()}
	zcommit_tree := func(tree Sha1, parents []Sha1, msg string) Sha1 {
		return xcommit_tree2(g, tree, parents, msg, fixed, fixed)
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> ø
	// Commit             .parent -> Commit
	if tagged_type == git.ObjectCommit {
		return zcommit_tree(mktree_empty(), []Sha1{tagged_sha1}, obj_encoded)
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> Tree
	// Tree               .parent -> ø
	if tagged_type == git.ObjectTree {
		return zcommit_tree(tagged_sha1, []Sha1{}, obj_encoded)
	}

	// Tag        ~>     Commit*
	//  |                 .msg:      Tag
	//  v                 .tree   -> Tree* "tagged" -> Blob
	// Blob               .parent -> ø
	if tagged_type == git.ObjectBlob {
		tree_for_blob := xgitSha1("mktree", RunWith{stdin: fmt.Sprintf("100644 blob %s\ttagged\n", tagged_sha1)})
		return zcommit_tree(tree_for_blob, []Sha1{}, obj_encoded)
	}

	// Tag₂       ~>     Commit₂*
	//  |                 .msg:      Tag₂
	//  v                 .tree   -> ø
	// Tag₁               .parent -> Commit₁*
	if tagged_type == git.ObjectTag {
		commit1 := obj_represent_as_commit(g, tagged_sha1, tagged_type)
		return zcommit_tree(mktree_empty(), []Sha1{commit1}, obj_encoded)
	}

	exc.Raisef("%s (%q): unknown tagged type", sha1, tagged_type)
	panic(0)
Kirill Smelkov's avatar
Kirill Smelkov committed
256 257 258 259
}

// recreate tag/tree/blob from specially crafted commit
// (see obj_represent_as_commit() about how a objects are originally translated into commit)
260 261 262 263
// returns:
//   - tag:       recreated object sha1
//   - tree/blob: null sha1
func obj_recreate_from_commit(g *git.Repository, commit_sha1 Sha1) Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
	xraise := func(info interface{}) { exc.Raise(&RecreateObjError{commit_sha1, info}) }
	xraisef := func(f string, a ...interface{}) { xraise(fmt.Sprintf(f, a...)) }

	commit, err := g.LookupCommit(commit_sha1.AsOid())
	if err != nil {
		xraise(err)
	}
	if commit.ParentCount() > 1 {
		xraise(">1 parents")
	}

	obj_type, obj_raw, err := xstrings.HeadTail(commit.Message(), "\n")
	if err != nil {
		xraise("invalid encoded format")
	}
	switch obj_type {
	case "tag", "tree", "blob":
		// ok
	default:
		xraisef("unexpected encoded object type %q", obj_type)
	}

	// for tree/blob we do not need to do anything - that objects were reachable
	// from commit and are present in git db.
	if obj_type == "tree" || obj_type == "blob" {
		return Sha1{}
	}

	// re-create tag object
	tag_sha1, err := WriteObject(g, mem.Bytes(obj_raw), git.ObjectTag)
	exc.Raiseif(err)

	// the original tagged object should be already in repository, because we
	// always attach it to encoding commit one way or another,
	// except we need to recurse, if it was Tag₂->Tag₁
	tag, err := tag_parse(obj_raw)
	if err != nil {
		xraisef("encoded tag: %s", err)
	}
	if tag.tagged_type == git.ObjectTag {
		if commit.ParentCount() == 0 {
			xraise("encoded tag corrupt (tagged is tag but []parent is empty)")
		}
		obj_recreate_from_commit(g, Sha1FromOid(commit.ParentId(0)))
	}

	return tag_sha1
Kirill Smelkov's avatar
Kirill Smelkov committed
311 312 313
}

type RecreateObjError struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
314 315
	commit_sha1 Sha1
	info        interface{}
Kirill Smelkov's avatar
Kirill Smelkov committed
316 317 318
}

func (e *RecreateObjError) Error() string {
Kirill Smelkov's avatar
Kirill Smelkov committed
319
	return fmt.Sprintf("commit %s: %s", e.commit_sha1, e.info)
Kirill Smelkov's avatar
Kirill Smelkov committed
320 321 322 323 324
}

// -------- git-backup pull --------

// cmd_pull_usage prints usage of `git-backup pull` to stderr.
func cmd_pull_usage() {
	fmt.Fprint(os.Stderr,
`git-backup pull <dir1>:<prefix1> <dir2>:<prefix2> ...

Pull bare Git repositories & just files from dir1 into backup prefix1,
from dir2 into backup prefix2, etc...
`)
}

// PullSpec is one parsed `<dir>:<prefix>` argument of `git-backup pull`:
// things are pulled from dir into backup prefix.
type PullSpec struct {
	dir, prefix string
}

337
func cmd_pull(gb *git.Repository, argv []string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
	flags := flag.FlagSet{Usage: cmd_pull_usage}
	flags.Init("", flag.ExitOnError)
	flags.Parse(argv)

	argv = flags.Args()
	if len(argv) < 1 {
		cmd_pull_usage()
		os.Exit(1)
	}

	pullspecv := []PullSpec{}
	for _, arg := range argv {
		dir, prefix, err := xstrings.Split2(arg, ":")
		if err != nil {
			fmt.Fprintf(os.Stderr, "E: invalid pullspec %q\n", arg)
			cmd_pull_usage()
			os.Exit(1)
		}

		pullspecv = append(pullspecv, PullSpec{dir, prefix})
	}

	cmd_pull_(gb, pullspecv)
Kirill Smelkov's avatar
Kirill Smelkov committed
361 362
}

363
// Ref is info about a reference pointing to sha1.
Kirill Smelkov's avatar
Kirill Smelkov committed
364
type Ref struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
365 366
	name string // reference name without "refs/" prefix
	sha1 Sha1
Kirill Smelkov's avatar
Kirill Smelkov committed
367 368
}

369
func cmd_pull_(gb *git.Repository, pullspecv []PullSpec) {
Kirill Smelkov's avatar
Kirill Smelkov committed
370 371 372 373 374
	// while pulling, we'll keep refs from all pulled repositories under temp
	// unique work refs namespace.
	backup_time := time.Now().Format("20060102-1504")               // %Y%m%d-%H%M
	backup_refs_work := fmt.Sprintf("refs/backup/%s/", backup_time) // refs/backup/20150820-2109/

375 376
	// prevent another `git-backup pull` from running simultaneously
	backup_lock := "refs/backup.locked"
Kirill Smelkov's avatar
Kirill Smelkov committed
377
	xgit("update-ref", backup_lock, mktree_empty(), Sha1{})
378
	defer xgit("update-ref", "-d", backup_lock)
Kirill Smelkov's avatar
Kirill Smelkov committed
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657

	// make sure there is root commit
	var HEAD Sha1
	var err error
	gerr, __, _ := ggit("rev-parse", "--verify", "HEAD")
	if gerr != nil {
		infof("# creating root commit")
		// NOTE `git commit` does not work in bare repo - do commit by hand
		HEAD = xcommit_tree(gb, mktree_empty(), []Sha1{}, "Initialize git-backup repository")
		xgit("update-ref", "-m", "git-backup pull init", "HEAD", HEAD)
	} else {
		HEAD, err = Sha1Parse(__)
		exc.Raiseif(err)
	}

	// build index of "already-have" objects: all commits + tag/tree/blob that
	// were at heads of already pulled repositories.
	//
	// Build it once and use below to check ourselves whether a head from a pulled
	// repository needs to be actually fetched. If we don't, `git fetch-pack`
	// will do similar to "all commits" linear scan for every pulled repository,
	// which are many out there.
	alreadyHave := Sha1Set{}
	infof("# building \"already-have\" index")

	// already have: all commits
	//
	// As of lab.nexedi.com/20180612 there are ~ 1.7·10⁷ objects total in backup.
	// Of those there are ~ 1.9·10⁶ commit objects, i.e. ~10% of total.
	// Since 1 sha1 is 2·10¹ bytes, the space needed for keeping sha1 of all
	// commits is ~ 4·10⁷B = ~40MB. It is thus ok to keep this index in RAM for now.
	for _, __ := range xstrings.SplitLines(xgit("rev-list", HEAD), "\n") {
		sha1, err := Sha1Parse(__)
		exc.Raiseif(err)
		alreadyHave.Add(sha1)
	}

	// already have: tag/tree/blob that were at heads of already pulled repositories
	//
	// As of lab.nexedi.com/20180612 there are ~ 8.4·10⁴ refs in total.
	// Of those encoded tag/tree/blob are ~ 3.2·10⁴, i.e. ~40% of total.
	// The number of tag/tree/blob objects in alreadyHave is thus negligible
	// compared to the number of "all commits".
	hcommit, err := gb.LookupCommit(HEAD.AsOid())
	exc.Raiseif(err)
	htree, err := hcommit.Tree()
	exc.Raiseif(err)
	if htree.EntryByName("backup.refs") != nil {
		repotab, err := loadBackupRefs(fmt.Sprintf("%s:backup.refs", HEAD))
		exc.Raiseif(err)

		for _, repo := range repotab {
			for _, xref := range repo.refs {
				if xref.sha1 != xref.sha1_ && !alreadyHave.Contains(xref.sha1) {
					// make sure encoded tag/tree/blob objects represented as
					// commits are present. We do so, because we promise to
					// fetch that all objects in alreadyHave are present.
					obj_recreate_from_commit(gb, xref.sha1_)

					alreadyHave.Add(xref.sha1)
				}
			}
		}
	}

	// walk over specified dirs, pulling objects from git and blobbing non-git-object files
	blobbedv := []string{} // info about file pulled to blob, and not yet added to index
	for _, __ := range pullspecv {
		dir, prefix := __.dir, __.prefix

		// make sure index is empty for prefix (so that we start from clean
		// prefix namespace and this way won't leave stale removed things)
		xgit("rm", "--cached", "-r", "--ignore-unmatch", "--", prefix)

		here := my.FuncName()
		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) (errout error) {
			if err != nil {
				if os.IsNotExist(err) {
					// a file or directory was removed in parallel to us scanning the tree.
					infof("Warning: Skipping %s: %s", path, err)
					return nil
				}
				// any other error -> stop
				return err
			}

			// propagate exceptions properly via filepath.Walk as errors with calling context
			// (filepath is not our code)
			defer exc.Catch(func(e *exc.Error) {
				errout = exc.Addcallingcontext(here, e)
			})

			// files -> blobs + queue info for adding blobs to index
			if !info.IsDir() {
				infof("# file %s\t<- %s", prefix, path)
				blob, mode := file_to_blob(gb, path)
				blobbedv = append(blobbedv,
					fmt.Sprintf("%o %s\t%s", mode, blob, reprefix(dir, prefix, path)))
				return nil
			}

			// directories -> look for *.git and handle git object specially.

			// do not recurse into *.git/objects/  - we'll save them specially
			if strings.HasSuffix(path, ".git/objects") {
				return filepath.SkipDir
			}

			// else we recurse, but handle *.git specially - via fetching objects from it
			if !strings.HasSuffix(path, ".git") {
				return nil
			}

			// git repo - let's pull all refs from it to our backup refs namespace
			infof("# git  %s\t<- %s", prefix, path)
			refv, _, err := fetch(path, alreadyHave)
			exc.Raiseif(err)

			// TODO don't store to git references all references from fetched repository:
			//
			//      We need to store to git references only references that were actually
			//      fetched - so that next fetch, e.g. from a fork that also has new data
			//      as its upstream, won't have to transfer what we just have fetched
			//      from upstream.
			//
			//      For this purpose we can also save references by naming them as their
			//      sha1, not actual name, which will automatically deduplicate them in
			//      between several repositories, especially when/if pull will be made to
			//      work in parallel.
			//
			//      Such changed-only deduplicated references should be O(δ) - usually only
			//      a few, and this way we will also automatically avoid O(n^2) behaviour
			//      of every git fetch scanning all local references at its startup.
			//
			//      For backup.refs, we can generate it directly from refv of all fetched
			//      repositories saved in RAM.
			reporefprefix := backup_refs_work +
				// NOTE repo name is escaped as it can contain e.g. spaces, and refs must not
				path_refescape(reprefix(dir, prefix, path))
			for _, ref := range refv {
				err = mkref(gb, reporefprefix+"/"+ref.name, ref.sha1)
				exc.Raiseif(err)
			}

			// XXX do we want to do full fsck of source git repo on pull as well ?

			return nil
		})

		// re-raise / raise error after Walk
		if err != nil {
			e := exc.Aserror(err)
			e = exc.Addcontext(e, "pulling from "+dir)
			exc.Raise(e)
		}
	}

	// add to index files we converted to blobs
	xgit("update-index", "--add", "--index-info", RunWith{stdin: strings.Join(blobbedv, "\n")})

	// all refs from all found git repositories populated.
	// now prepare manifest with ref -> sha1 and do a synthetic commit merging all that sha1
	// (so they become all reachable from HEAD -> survive repack and be transferable on git pull)
	//
	// NOTE we handle tag/tree/blob objects specially - because these objects cannot
	// be in commit parents, we convert them to specially-crafted commits and use them.
	// The commits prepared contain full info how to restore original objects.

	// backup.refs format:
	//
	//   1eeb0324 <prefix>/wendelin.core.git/heads/master
	//   213a9243 <prefix>/wendelin.core.git/tags/v0.4 <213a9243-converted-to-commit>
	//   ...
	//
	// NOTE `git for-each-ref` sorts output by ref
	//      -> backup_refs is sorted and stable between runs
	backup_refs_dump := xgit("for-each-ref", backup_refs_work)
	backup_refs_list := []Ref{}       // parsed dump
	backup_refsv := []string{}        // backup.refs content
	backup_refs_parents := Sha1Set{}  // sha1 for commit parents, obtained from refs
	noncommit_seen := map[Sha1]Sha1{} // {} sha1 -> sha1_ (there are many duplicate tags)
	for _, __ := range xstrings.SplitLines(backup_refs_dump, "\n") {
		sha1, type_, ref := Sha1{}, "", ""
		_, err := fmt.Sscanf(__, "%s %s %s\n", &sha1, &type_, &ref)
		if err != nil {
			exc.Raisef("%s: strange for-each-ref entry %q", backup_refs_work, __)
		}
		backup_refs_list = append(backup_refs_list, Ref{ref, sha1})
		backup_refs_entry := fmt.Sprintf("%s %s", sha1, strip_prefix(backup_refs_work, ref))

		// represent tag/tree/blob as specially crafted commit, because we
		// cannot use it as commit parent.
		sha1_ := sha1
		if type_ != "commit" {
			//infof("obj_as_commit %s  %s\t%s", sha1, type_, ref)  XXX
			var seen bool
			sha1_, seen = noncommit_seen[sha1]
			if !seen {
				obj_type, ok := gittype(type_)
				if !ok {
					exc.Raisef("%s: invalid git type in entry %q", backup_refs_work, __)
				}
				sha1_ = obj_represent_as_commit(gb, sha1, obj_type)
				noncommit_seen[sha1] = sha1_
			}

			backup_refs_entry += fmt.Sprintf(" %s", sha1_)
		}

		backup_refsv = append(backup_refsv, backup_refs_entry)

		if !backup_refs_parents.Contains(sha1_) { // several refs can refer to the same sha1
			backup_refs_parents.Add(sha1_)
		}
	}

	backup_refs := strings.Join(backup_refsv, "\n")
	backup_refs_parentv := backup_refs_parents.Elements()
	sort.Sort(BySha1(backup_refs_parentv)) // so parents order is stable in between runs

	// backup_refs -> blob
	backup_refs_sha1 := xgitSha1("hash-object", "-w", "--stdin", RunWith{stdin: backup_refs})

	// add backup_refs blob to index
	xgit("update-index", "--add", "--cacheinfo", fmt.Sprintf("100644,%s,backup.refs", backup_refs_sha1))

	// index is ready - prepare tree and commit
	backup_tree_sha1 := xgitSha1("write-tree")
	commit_sha1 := xcommit_tree(gb, backup_tree_sha1, append([]Sha1{HEAD}, backup_refs_parentv...),
		"Git-backup "+backup_time)

	xgit("update-ref", "-m", "git-backup pull", "HEAD", commit_sha1, HEAD)

	// remove no-longer needed backup refs & verify they don't stay
	backup_refs_delete := ""
	for _, ref := range backup_refs_list {
		backup_refs_delete += fmt.Sprintf("delete %s %s\n", ref.name, ref.sha1)
	}

	xgit("update-ref", "--stdin", RunWith{stdin: backup_refs_delete})
	__ = xgit("for-each-ref", backup_refs_work)
	if __ != "" {
		exc.Raisef("Backup refs under %s not deleted properly", backup_refs_work)
	}

	// NOTE  `delete` deletes only files, but leaves empty dirs around.
	//       more important: this affect performance of future `git-backup pull` run a *LOT*
	//
	//       reason is: `git pull` first check local refs, and for doing so it
	//       recourse into all directories, even empty ones.
	//
	//       https://lab.nexedi.com/lab.nexedi.com/lab.nexedi.com/issues/4
	//
	//       So remove all dirs under backup_refs_work prefix in the end.
	//
	// TODO  Revisit this when reworking fetch to be parallel. Reason is: in
	//       the process of pulling repositories, the more references we
	//       accumulate, the longer pull starts to be, so it becomes O(n^2).
	//
	//       -> what to do is described nearby fetch/mkref call.
	gitdir := xgit("rev-parse", "--git-dir")
	err = os.RemoveAll(gitdir + "/" + backup_refs_work)
	exc.Raiseif(err) // NOTE err is nil if path does not exist

	// if we have working copy - update it
	bare := xgit("rev-parse", "--is-bare-repository")
	if bare != "true" {
		// `git checkout-index -af`  -- does not delete deleted files
		// `git read-tree -v -u --reset HEAD~ HEAD`  -- needs index matching
		// original worktree to properly work, but we already have updated index
		//
		// so we get changes we committed as diff and apply to worktree
		diff := xgit("diff", "--binary", HEAD, "HEAD", RunWith{raw: true})
		if diff != "" {
			diffstat := xgit("apply", "--stat", "--apply", "--binary", "--whitespace=nowarn",
				RunWith{stdin: diff, raw: true})
			infof("%s", diffstat)
		}
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
658 659
}

660 661 662 663 664 665
// fetch makes sure all objects from a repository are present in backup place.
//
// It fetches objects that are potentially missing in backup from the
// repository in question. The objects considered to fetch are those, that are
// reachable from all repository references.
//
666 667 668 669 670 671 672 673 674 675 676 677 678
// AlreadyHave can be given to indicate knowledge on what objects our repository
// already has. If remote advertises tip with sha1 in alreadyHave, that tip won't be
// fetched. Notice: alreadyHave is consulted directly - no reachability scan is
// performed on it.
//
// All objects reachable from alreadyHave must be in our repository.
// AlreadyHave does not need to be complete - if we have something that is not
// in alreadyHave - it can affect only speed, not correctness.
//
// Returned are 2 lists of references from the source repository:
//
//  - list of all references, and
//  - list of references we actually had to fetch.
679 680 681
//
// Note: fetch does not create any local references - the references returned
// only describe state of references in fetched source repository.
682
func fetch(repo string, alreadyHave Sha1Set) (refv, fetchedv []Ref, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762
	defer xerr.Contextf(&err, "fetch %s", repo)

	// first check which references are advertised
	refv, err = lsremote(repo)
	if err != nil {
		return nil, nil, err
	}

	// check if we already have something
	var fetchv []Ref // references we need to actually fetch.
	for _, ref := range refv {
		if !alreadyHave.Contains(ref.sha1) {
			fetchv = append(fetchv, ref)
		}
	}

	// if there is nothing to fetch - we are done
	if len(fetchv) == 0 {
		return refv, fetchv, nil
	}

	// fetch by sha1 what we don't already have from advertised.
	//
	// even if refs would change after ls-remote but before here, we should be
	// getting exactly what was advertised.
	//
	// related link on the subject:
	// https://git.kernel.org/pub/scm/git/git.git/commit/?h=051e4005a3
	var argv []interface{}
	arg := func(v ...interface{}) { argv = append(argv, v...) }
	arg(
		// check objects for corruption as they are fetched
		"-c", "fetch.fsckObjects=true",
		"fetch-pack", "--thin",

		// force upload-pack to allow us asking any sha1 we want.
		// needed because advertised refs we got at lsremote time could have changed.
		"--upload-pack=git -c uploadpack.allowAnySHA1InWant=true"+
			// workarounds for git < 2.11.1, which does not have uploadpack.allowAnySHA1InWant:
			" -c uploadpack.allowTipSHA1InWant=true -c uploadpack.allowReachableSHA1InWant=true"+
			//
			" upload-pack",

		repo)
	for _, ref := range fetchv {
		arg(ref.sha1)
	}
	arg(RunWith{stderr: gitprogress()})

	gerr, _, _ := ggit(argv...)
	if gerr != nil {
		return nil, nil, gerr
	}

	// fetch-pack ran ok - now check that all fetched tips are indeed fully
	// connected and that we also have all referenced blob/tree objects. The
	// reason for this check is that source repository could send us a pack with
	// e.g. some objects missing and this way even if fetch-pack would report
	// success, chances could be we won't have all the objects we think we
	// fetched.
	//
	// when checking we assume that the roots we already have at all our
	// references are ok.
	//
	// related link on the subject:
	// https://git.kernel.org/pub/scm/git/git.git/commit/?h=6d4bb3833c
	argv = nil
	arg("rev-list", "--quiet", "--objects", "--not", "--all", "--not")
	for _, ref := range fetchv {
		arg(ref.sha1)
	}
	arg(RunWith{stderr: gitprogress()})

	gerr, _, _ = ggit(argv...)
	if gerr != nil {
		return nil, nil, fmt.Errorf("remote did not send all neccessary objects")
	}

	// fetched ok
	return refv, fetchv, nil
763 764 765 766
}

// lsremote lists all references advertised by repo.
func lsremote(repo string) (refv []Ref, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803
	defer xerr.Contextf(&err, "lsremote %s", repo)

	// NOTE --refs instructs to omit peeled refs like
	//
	//   c668db59ccc59e97ce81f769d9f4633e27ad3bdb refs/tags/v0.1
	//   4b6821f4a4e4c9648941120ccbab03982e33104f refs/tags/v0.1^{}  <--
	//
	// because fetch-pack errors on them:
	//
	//   https://public-inbox.org/git/20180610143231.7131-1-kirr@nexedi.com/
	//
	// we don't need to pull them anyway.
	gerr, stdout, _ := ggit("ls-remote", "--refs", repo)
	if gerr != nil {
		return nil, gerr
	}

	//  oid refname
	//  oid refname
	//  ...
	for _, entry := range xstrings.SplitLines(stdout, "\n") {
		sha1, ref := Sha1{}, ""
		_, err := fmt.Sscanf(entry, "%s %s\n", &sha1, &ref)
		if err != nil {
			return nil, fmt.Errorf("strange output entry: %q", entry)
		}

		// Ref says its name goes without "refs/" prefix.
		if !strings.HasPrefix(ref, "refs/") {
			return nil, fmt.Errorf("non-refs/ reference: %q", ref)
		}
		ref = strings.TrimPrefix(ref, "refs/")

		refv = append(refv, Ref{ref, sha1})
	}

	return refv, nil
804 805
}

Kirill Smelkov's avatar
Kirill Smelkov committed
806 807 808
// -------- git-backup restore --------

// cmd_restore_usage prints help text for `git-backup restore` to stderr.
func cmd_restore_usage() {
	const help = `git-backup restore <commit-ish> <prefix1>:<dir1> <prefix2>:<dir2> ...

Restore Git repositories & just files from backup prefix1 into dir1,
from backup prefix2 into dir2, etc...

Backup state to restore is taken from <commit-ish>.
`
	fmt.Fprint(os.Stderr, help)
}

// RestoreSpec is one `<prefix>:<dir>` argument of `git-backup restore`.
type RestoreSpec struct {
	prefix string // prefix inside the backup to restore from
	dir    string // directory to restore into
}

823
func cmd_restore(gb *git.Repository, argv []string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848
	flags := flag.FlagSet{Usage: cmd_restore_usage}
	flags.Init("", flag.ExitOnError)
	flags.Parse(argv)

	argv = flags.Args()
	if len(argv) < 2 {
		cmd_restore_usage()
		os.Exit(1)
	}

	HEAD := argv[0]

	restorespecv := []RestoreSpec{}
	for _, arg := range argv[1:] {
		prefix, dir, err := xstrings.Split2(arg, ":")
		if err != nil {
			fmt.Fprintf(os.Stderr, "E: invalid restorespec %q\n", arg)
			cmd_restore_usage()
			os.Exit(1)
		}

		restorespecv = append(restorespecv, RestoreSpec{prefix, dir})
	}

	cmd_restore_(gb, HEAD, restorespecv)
Kirill Smelkov's avatar
Kirill Smelkov committed
849 850 851 852 853
}

// kirr/wendelin.core.git/heads/master -> kirr/wendelin.core.git, heads/master
// tiwariayush/Discussion%20Forum%20.git/... -> tiwariayush/Discussion Forum .git, ...
func reporef_split(reporef string) (repo, ref string) {
Kirill Smelkov's avatar
Kirill Smelkov committed
854 855 856 857 858 859 860 861
	dotgit := strings.Index(reporef, ".git/")
	if dotgit == -1 {
		exc.Raisef("E: %s is not a ref for a git repo", reporef)
	}
	repo, ref = reporef[:dotgit+4], reporef[dotgit+4+1:]
	repo, err := path_refunescape(repo) // unescape repo name we originally escaped when making backup
	exc.Raiseif(err)
	return repo, ref
Kirill Smelkov's avatar
Kirill Smelkov committed
862 863 864 865
}

// sha1 value(s) for a ref in 'backup.refs'
type BackupRefSha1 struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
866 867 868
	sha1  Sha1 // original sha1 this ref was pointing to in original repo
	sha1_ Sha1 // sha1 actually used to represent sha1's object in backup repo
	           // (for tag/tree/blob - they are converted to commits)
Kirill Smelkov's avatar
Kirill Smelkov committed
869 870
}

871
// BackupRef represents 1 reference entry in 'backup.refs'   (repo prefix stripped)
Kirill Smelkov's avatar
Kirill Smelkov committed
872
type BackupRef struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
873 874
	name string // reference name without "refs/" prefix
	BackupRefSha1
Kirill Smelkov's avatar
Kirill Smelkov committed
875 876 877 878 879 880 881
}

// RefMap maps a reference name to the sha1 values recorded for it:
//
// {} refname -> sha1, sha1_
type RefMap map[string]BackupRefSha1

// info about a repository from backup.refs
type BackupRepo struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
882 883
	repopath string // full repo path with backup prefix
	refs     RefMap
Kirill Smelkov's avatar
Kirill Smelkov committed
884 885 886 887
}

// all RefMap values as flat []BackupRef
func (m RefMap) Values() []BackupRef {
Kirill Smelkov's avatar
Kirill Smelkov committed
888 889 890 891 892
	ev := make([]BackupRef, 0, len(m))
	for ref, refsha1 := range m {
		ev = append(ev, BackupRef{ref, refsha1})
	}
	return ev
Kirill Smelkov's avatar
Kirill Smelkov committed
893 894 895 896 897 898 899
}

// for sorting []BackupRef by refname
type ByRefname []BackupRef

func (br ByRefname) Len() int           { return len(br) }
func (br ByRefname) Swap(i, j int)      { br[i], br[j] = br[j], br[i] }
900
func (br ByRefname) Less(i, j int) bool { return strings.Compare(br[i].name, br[j].name) < 0 }
Kirill Smelkov's avatar
Kirill Smelkov committed
901 902 903

// all sha1 heads RefMap points to, in sorted order
func (m RefMap) Sha1Heads() []Sha1 {
Kirill Smelkov's avatar
Kirill Smelkov committed
904 905 906 907 908 909 910
	hs := Sha1Set{}
	for _, refsha1 := range m {
		hs.Add(refsha1.sha1)
	}
	headv := hs.Elements()
	sort.Sort(BySha1(headv))
	return headv
Kirill Smelkov's avatar
Kirill Smelkov committed
911 912 913 914
}

// like Sha1Heads() but returns heads in text format delimited by "\n"
func (m RefMap) Sha1HeadsStr() string {
Kirill Smelkov's avatar
Kirill Smelkov committed
915 916 917 918 919
	s := ""
	for _, sha1 := range m.Sha1Heads() {
		s += sha1.String() + "\n"
	}
	return s
Kirill Smelkov's avatar
Kirill Smelkov committed
920 921 922 923 924 925 926 927 928 929 930
}

// ByRepoPath implements sort.Interface to order []*BackupRepo by repopath.
type ByRepoPath []*BackupRepo

func (br ByRepoPath) Len() int { return len(br) }

func (br ByRepoPath) Swap(i, j int) { br[j], br[i] = br[i], br[j] }

func (br ByRepoPath) Less(i, j int) bool { return br[i].repopath < br[j].repopath }

// also for searching sorted []BackupRepo by repopath prefix
func (br ByRepoPath) Search(prefix string) int {
Kirill Smelkov's avatar
Kirill Smelkov committed
931 932 933
	return sort.Search(len(br), func(i int) bool {
		return strings.Compare(br[i].repopath, prefix) >= 0
	})
Kirill Smelkov's avatar
Kirill Smelkov committed
934 935
}

936 937
// request to extract a pack
type PackExtractReq struct {
Kirill Smelkov's avatar
Kirill Smelkov committed
938 939
	refs     RefMap // extract pack with objects from this heads
	repopath string // into repository located here
940

Kirill Smelkov's avatar
Kirill Smelkov committed
941 942
	// for info only: request was generated restoring from under this backup prefix
	prefix string
943 944
}

945
func cmd_restore_(gb *git.Repository, HEAD_ string, restorespecv []RestoreSpec) {
Kirill Smelkov's avatar
Kirill Smelkov committed
946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152
	HEAD := xgitSha1("rev-parse", "--verify", HEAD_)

	// read backup refs index
	repotab, err := loadBackupRefs(fmt.Sprintf("%s:backup.refs", HEAD))
	exc.Raiseif(err)

	// flattened & sorted repotab
	// NOTE sorted - to process repos always in the same order & for searching
	repov := make([]*BackupRepo, 0, len(repotab))
	for _, repo := range repotab {
		repov = append(repov, repo)
	}
	sort.Sort(ByRepoPath(repov))

	// repotab no longer needed
	repotab = nil

	packxq := make(chan PackExtractReq, 2*njobs) // requests to extract packs
	errch := make(chan error)                    // errors from workers
	stopch := make(chan struct{})                // broadcasts restore has to be cancelled
	wg := sync.WaitGroup{}

	// main worker: walk over specified prefixes restoring files and
	// scheduling pack extraction requests from *.git -> packxq
	wg.Add(1)
	go func() {
		defer wg.Done()
		defer close(packxq)
		// raised err -> errch
		here := my.FuncName()
		defer exc.Catch(func(e *exc.Error) {
			errch <- exc.Addcallingcontext(here, e)
		})

	runloop:
		for _, __ := range restorespecv {
			prefix, dir := __.prefix, __.dir

			// ensure dir did not exist before restore run
			err := os.Mkdir(dir, 0777)
			exc.Raiseif(err)

			// files
			lstree := xgit("ls-tree", "--full-tree", "-r", "-z", "--", HEAD, prefix, RunWith{raw: true})
			repos_seen := StrSet{} // dirs of *.git seen while restoring files
			for _, __ := range xstrings.SplitLines(lstree, "\x00") {
				mode, type_, sha1, filename, err := parse_lstree_entry(__)
				// NOTE
				//  - `ls-tree -r` shows only leaf objects
				//  - git-backup repository does not have submodules and the like
				// -> type should be "blob" only
				if err != nil || type_ != "blob" {
					exc.Raisef("%s: invalid/unexpected ls-tree entry %q", HEAD, __)
				}

				filename = reprefix(prefix, dir, filename)
				infof("# file %s\t-> %s", prefix, filename)
				blob_to_file(gb, sha1, mode, filename)

				// make sure git will recognize *.git as repo:
				//   - it should have refs/{heads,tags}/ and objects/pack/ inside.
				//
				// NOTE doing it while restoring files, because a repo could be
				//   empty - without refs at all, and thus next "git packs restore"
				//   step will not be run for it.
				filedir := pathpkg.Dir(filename)
				if strings.HasSuffix(filedir, ".git") && !repos_seen.Contains(filedir) {
					infof("# repo %s\t-> %s", prefix, filedir)
					for _, __ := range []string{"refs/heads", "refs/tags", "objects/pack"} {
						err := os.MkdirAll(filedir+"/"+__, 0777)
						exc.Raiseif(err)
					}
					repos_seen.Add(filedir)
				}
			}

			// git packs
			for i := ByRepoPath(repov).Search(prefix); i < len(repov); i++ {
				repo := repov[i]
				if !strings.HasPrefix(repo.repopath, prefix) {
					break // repov is sorted - end of repositories with prefix
				}

				// make sure tag/tree/blob objects represented as commits are
				// present, before we generate pack for restored repo.
				// ( such objects could be lost e.g. after backup repo repack as they
				//   are not reachable from backup repo HEAD )
				for _, __ := range repo.refs {
					if __.sha1 != __.sha1_ {
						obj_recreate_from_commit(gb, __.sha1_)
					}
				}

				select {
				case packxq <- PackExtractReq{refs: repo.refs,
					repopath: reprefix(prefix, dir, repo.repopath),
					prefix:   prefix}:

				case <-stopch:
					break runloop
				}
			}
		}
	}()

	// pack workers: packxq -> extract packs
	for i := 0; i < njobs; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// raised err -> errch
			here := my.FuncName()
			defer exc.Catch(func(e *exc.Error) {
				errch <- exc.Addcallingcontext(here, e)
			})

		runloop:
			for {
				select {
				case <-stopch:
					break runloop

				case p, ok := <-packxq:
					if !ok {
						break runloop
					}
					infof("# git  %s\t-> %s", p.prefix, p.repopath)

					// extract pack for that repo from big backup pack + decoded tags
					pack_argv := []string{
						"-c", "pack.threads=1", // occupy only 1 CPU + it packs better
						"pack-objects",
						"--revs", // include all objects referencable from input sha1 list
						"--reuse-object", "--reuse-delta", "--delta-base-offset",

						// use bitmap index from backup repo, if present (faster pack generation)
						// https://git.kernel.org/pub/scm/git/git.git/commit/?h=645c432d61
						"--use-bitmap-index",
					}
					if verbose <= 0 {
						pack_argv = append(pack_argv, "-q")
					}
					pack_argv = append(pack_argv, p.repopath+"/objects/pack/pack")

					xgit2(pack_argv, RunWith{stdin: p.refs.Sha1HeadsStr(), stderr: gitprogress()})

					// verify that extracted repo refs match backup.refs index after extraction
					x_ref_list := xgit("--git-dir="+p.repopath,
						"for-each-ref", "--format=%(objectname) %(refname)")
					repo_refs := p.refs.Values()
					sort.Sort(ByRefname(repo_refs))
					repo_ref_listv := make([]string, 0, len(repo_refs))
					for _, ref := range repo_refs {
						repo_ref_listv = append(repo_ref_listv, fmt.Sprintf("%s refs/%s", ref.sha1, ref.name))
					}
					repo_ref_list := strings.Join(repo_ref_listv, "\n")
					if x_ref_list != repo_ref_list {
						// TODO show refs diff, not 2 dumps
						exc.Raisef("E: extracted %s refs corrupt:\n\nwant:\n%s\n\nhave:\n%s",
							p.repopath, repo_ref_list, x_ref_list)
					}

					// check connectivity in recreated repository.
					//
					// This way we verify that extracted pack indeed contains all
					// objects for all refs in the repo.
					//
					// Compared to fsck we do not re-compute sha1 sum of objects which
					// is significantly faster.
					gerr, _, _ := ggit("--git-dir="+p.repopath,
						"rev-list", "--objects", "--stdin", "--quiet", RunWith{stdin: p.refs.Sha1HeadsStr()})
					if gerr != nil {
						fmt.Fprintln(os.Stderr, "E: Problem while checking connectivity of extracted repo:")
						exc.Raise(gerr)
					}

					// XXX disabled because it is slow
					// // NOTE progress goes to stderr, problems go to stdout
					// xgit("--git-dir=" + p.repopath, "fsck",
					//         # only check that traversal from refs is ok: this unpacks
					//         # commits and trees and verifies blob objects are there,
					//         # but do _not_ unpack blobs =fast.
					//         "--connectivity-only",
					//         RunWith{stdout: gitprogress(), stderr: gitprogress()})
				}
			}
		}()
	}

	// wait for workers to finish & collect/reraise their errors
	go func() {
		wg.Wait()
		close(errch)
	}()

	ev := xerr.Errorv{}
	for e := range errch {
		// tell everything to stop on first error
		if len(ev) == 0 {
			close(stopch)
		}
		ev = append(ev, e)
	}

	if len(ev) != 0 {
		exc.Raise(ev)
	}
Kirill Smelkov's avatar
Kirill Smelkov committed
1153 1154
}

1155 1156 1157 1158
// loadBackupRefs loads 'backup.ref' content from a git object.
//
// an example of object is e.g. "HEAD:backup.ref".
func loadBackupRefs(object string) (repotab map[string]*BackupRepo, err error) {
Kirill Smelkov's avatar
Kirill Smelkov committed
1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197
	defer xerr.Contextf(&err, "load backup.refs %q", object)

	gerr, backup_refs, _ := ggit("cat-file", "blob", object)
	if gerr != nil {
		return nil, gerr
	}

	repotab = make(map[string]*BackupRepo)
	for _, refentry := range xstrings.SplitLines(backup_refs, "\n") {
		// sha1 prefix+refname (sha1_)
		badentry := func() error { return fmt.Errorf("invalid entry: %q", refentry) }
		refentryv := strings.Fields(refentry)
		if !(2 <= len(refentryv) && len(refentryv) <= 3) {
			return nil, badentry()
		}
		sha1, err := Sha1Parse(refentryv[0])
		sha1_, err_ := sha1, err
		if len(refentryv) == 3 {
			sha1_, err_ = Sha1Parse(refentryv[2])
		}
		if err != nil || err_ != nil {
			return nil, badentry()
		}
		reporef := refentryv[1]
		repopath, ref := reporef_split(reporef)

		repo := repotab[repopath]
		if repo == nil {
			repo = &BackupRepo{repopath, RefMap{}}
			repotab[repopath] = repo
		}

		if _, alreadyin := repo.refs[ref]; alreadyin {
			return nil, fmt.Errorf("duplicate ref %q", ref)
		}
		repo.refs[ref] = BackupRefSha1{sha1, sha1_}
	}

	return repotab, nil
1198 1199
}

1200
var commands = map[string]func(*git.Repository, []string){
Kirill Smelkov's avatar
Kirill Smelkov committed
1201 1202
	"pull":    cmd_pull,
	"restore": cmd_restore,
Kirill Smelkov's avatar
Kirill Smelkov committed
1203 1204 1205
}

func usage() {
Kirill Smelkov's avatar
Kirill Smelkov committed
1206
	fmt.Fprintf(os.Stderr,
Kirill Smelkov's avatar
Kirill Smelkov committed
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216
`git-backup [options] <command>

    pull        pull git-repositories and files to backup
    restore     restore git-repositories and files from backup

  common options:

    -h --help       this help text.
    -v              increase verbosity.
    -q              decrease verbosity.
1217 1218
    -j N            allow max N jobs to spawn; default=NPROC (%d on this system)
`, njobs)
Kirill Smelkov's avatar
Kirill Smelkov committed
1219 1220 1221
}

func main() {
Kirill Smelkov's avatar
Kirill Smelkov committed
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261
	flag.Usage = usage
	quiet := 0
	flag.Var((*xflag.Count)(&verbose), "v", "verbosity level")
	flag.Var((*xflag.Count)(&quiet), "q", "decrease verbosity")
	flag.IntVar(&njobs, "j", njobs, "allow max N jobs to spawn")
	flag.Parse()
	verbose -= quiet
	argv := flag.Args()

	if len(argv) == 0 {
		usage()
		os.Exit(1)
	}

	cmd := commands[argv[0]]
	if cmd == nil {
		fmt.Fprintf(os.Stderr, "E: unknown command %q", argv[0])
		os.Exit(1)
	}

	// catch Error and report info from it
	here := my.FuncName()
	defer exc.Catch(func(e *exc.Error) {
		e = exc.Addcallingcontext(here, e)
		fmt.Fprintln(os.Stderr, e)

		// also show traceback if debug
		if verbose > 2 {
			fmt.Fprint(os.Stderr, "\n")
			debug.PrintStack()
		}

		os.Exit(1)
	})

	// backup repository
	gb, err := git.OpenRepository(".")
	exc.Raiseif(err)

	cmd(gb, argv[1:])
Kirill Smelkov's avatar
Kirill Smelkov committed
1262
}