diff --git a/man/man8/funclatency.8 b/man/man8/funclatency.8
index 90eed0e2ff4ecf0ad757b4953baaf7e07a6b1d71..34dec66aa69e046f7da07732c24c0e0bcdc1170c 100644
--- a/man/man8/funclatency.8
+++ b/man/man8/funclatency.8
@@ -2,7 +2,7 @@
 .SH NAME
 funclatency \- Time kernel funcitons and print latency as a histogram.
 .SH SYNOPSIS
-.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-u] [\-m] [\-r] pattern
+.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-u] [\-m] [\-r] [\-F] pattern
 .SH DESCRIPTION
 This tool traces kernel function calls and times their duration (latency), and
 shows the latency distribution as a histogram. The time is measured from when
@@ -12,6 +12,11 @@ time and time spent blocked.
 This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
 for efficiency.
 
+Currently nested or recursive functions are not supported properly, and
+timestamps will be overwritten, creating dubious output. Try to match single
+functions, or groups of functions that run at the same stack layer, and
+don't ultimately call each other.
+
 WARNING: This uses dynamic tracing of (what can be many) kernel functions, an
 activity that has had issues on some kernel versions (risk of panics or
 freezes). Test, and know what you are doing, before use.
@@ -41,6 +46,9 @@ Output histogram in microseconds.
 \-m
 Output histogram in milliseconds.
 .TP
+\-F
+Print a separate histogram per function matched.
+.TP
 \-r
 Use regular expressions for the search pattern.
 .SH EXAMPLES
@@ -68,6 +76,10 @@ Time vfs_read() for process ID 181 only:
 Time both vfs_fstat() and vfs_fstatat() calls, by use of a wildcard:
 #
 .B funclatency 'vfs_fstat*'
+.TP
+Time both vfs_fstat* calls, and print a separate histogram for each:
+#
+.B funclatency -F 'vfs_fstat*'
 .SH FIELDS
 .TP
 necs
diff --git a/tools/funclatency b/tools/funclatency
index 8ff020e0871a7108d354b4f9a365dccec7aca9a2..21438fb8b8d3840050b9b61d560fa14931bed227 100755
--- a/tools/funclatency
+++ b/tools/funclatency
@@ -12,6 +12,11 @@
 # multiple functions is of limited use, since the output has one histogram for
 # everything. Future versions should split the output histogram by the function.
 #
+# Currently nested or recursive functions are not supported properly, and
+# timestamps will be overwritten, creating dubious output. Try to match single
+# functions, or groups of functions that run at the same stack layer, and
+# don't ultimately call each other.
+#
 # Copyright (c) 2015 Brendan Gregg.
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
@@ -31,6 +36,7 @@ examples = """examples:
     ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
     ./funclatency -p 181 vfs_read   # time process 181 only
     ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency -F 'vfs_r*'       # show one histogram per matched function
 """
 parser = argparse.ArgumentParser(
 	description="Time kernel funcitons and print latency as a histogram",
@@ -46,6 +52,8 @@ parser.add_argument("-u", "--microseconds", action="store_true",
 	help="microsecond histogram")
 parser.add_argument("-m", "--milliseconds", action="store_true",
 	help="millisecond histogram")
+parser.add_argument("-F", "--function", action="store_true",
+	help="show a separate histogram per function")
 parser.add_argument("-r", "--regexp", action="store_true",
 	help="use regular expressions. Default is \"*\" wildcards only.")
 parser.add_argument("pattern",
@@ -62,8 +70,13 @@ bpf_text = """
 #include <uapi/linux/ptrace.h>
 #include <linux/blkdev.h>
 
+typedef struct ip_key {
+	u64 ip;
+	u64 slot;
+} ip_key_t;
+
 BPF_HASH(start, u32);
-BPF_HISTOGRAM(dist);
+STORAGE
 
 int trace_func_entry(struct pt_regs *ctx)
 {
@@ -71,6 +84,7 @@ int trace_func_entry(struct pt_regs *ctx)
 	u64 ts = bpf_ktime_get_ns();
 
 	FILTER
+	ENTRYSTORE
 	start.update(&pid, &ts);
 
 	return 0;
@@ -91,11 +105,13 @@ int trace_func_return(struct pt_regs *ctx)
 	FACTOR
 
 	// store as histogram
-	dist.increment(bpf_log2l(delta));
+	STORE
 
 	return 0;
 }
 """
+
+# code substitutions
 if args.pid:
 	bpf_text = bpf_text.replace('FILTER',
 	    'if (pid != %s) { return 0; }' % args.pid)
@@ -110,6 +126,21 @@ elif args.microseconds:
 else:
 	bpf_text = bpf_text.replace('FACTOR', '')
 	label = "nsecs"
+if args.function:
+	bpf_text = bpf_text.replace('STORAGE', 'BPF_HASH(ipaddr, u32);\n' +
+	    'BPF_HISTOGRAM(dist, ip_key_t);')
+	# stash the IP on entry, as on return it's kretprobe_trampoline:
+	bpf_text = bpf_text.replace('ENTRYSTORE',
+	    'u64 ip = ctx->ip; ipaddr.update(&pid, &ip);')
+	bpf_text = bpf_text.replace('STORE',
+	    'u64 ip, *ipp = ipaddr.lookup(&pid); if (ipp) { ip = *ipp; ' +
+	    'dist.increment((ip_key_t){ip, bpf_log2l(delta)}); ' +
+	    'ipaddr.delete(&pid); }')
+else:
+	bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+	bpf_text = bpf_text.replace('ENTRYSTORE', '')
+	bpf_text = bpf_text.replace('STORE',
+	    'dist.increment(bpf_log2l(delta));')
 if debug:
 	print(bpf_text)
 
@@ -130,6 +161,20 @@ if matched == 0:
 print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
     (matched / 2, args.pattern))
 
+# custom output (from __init__.py)
+def print_log2_hist_byfunc(self, val_type="value"):
+	tmp = {}
+	f1 = self.Key._fields_[0][0]
+	f2 = self.Key._fields_[1][0]
+	for k, v in self.items():
+	    bucket = getattr(k, f1)
+	    vals = tmp[bucket] = tmp.get(bucket, [0] * 65)
+	    slot = getattr(k, f2)
+	    vals[slot] = v.value
+	for bucket, vals in tmp.items():
+	    print("\nFunction = %s" % BPF.ksym(bucket))
+	    self._print_log2_hist(vals, val_type, 0)
+
 # output
 exiting = 0 if args.interval else 1
 dist = b.get_table("dist")
@@ -145,7 +190,10 @@ while (1):
 	if args.timestamp:
 		print("%-8s\n" % strftime("%H:%M:%S"), end="")
 
-	dist.print_log2_hist(label)
+	if args.function:
+		print_log2_hist_byfunc(dist, label)
+	else:
+		dist.print_log2_hist(label)
 	dist.clear()
 
 	if exiting:
diff --git a/tools/funclatency_example.txt b/tools/funclatency_example.txt
index f6182671734fddfe3c7e7365ca106544a799e7e0..64acd81c1fc08413d31dc79532dbf0031bae49a7 100644
--- a/tools/funclatency_example.txt
+++ b/tools/funclatency_example.txt
@@ -195,10 +195,52 @@ Detaching...
 The distribution between 64 and 511 milliseconds shows keystroke latency.
 
 
+The -F option can be used to print a histogram per function. Eg:
+
+# ./funclatency -uF 'vfs_r*'
+Tracing 5 functions for "vfs_r*"... Hit Ctrl-C to end.
+^C
+
+Function = vfs_read
+     usecs               : count     distribution
+         0 -> 1          : 1044     |****************************************|
+         2 -> 3          : 383      |**************                          |
+         4 -> 7          : 76       |**                                      |
+         8 -> 15         : 41       |*                                       |
+        16 -> 31         : 26       |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 4        |                                        |
+      4096 -> 8191       : 2        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 2        |                                        |
+     65536 -> 131071     : 5        |                                        |
+    131072 -> 262143     : 5        |                                        |
+    262144 -> 524287     : 3        |                                        |
+    524288 -> 1048575    : 7        |                                        |
+
+Function = vfs_rename
+     usecs               : count     distribution
+         0 -> 1          : 2        |****                                    |
+         2 -> 3          : 2        |****                                    |
+         4 -> 7          : 2        |****                                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 6        |*************                           |
+        32 -> 63         : 18       |****************************************|
+Detaching...
+
+
+
 USAGE message:
 
-# ./funclatency  -h
-usage: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-r] pattern
+# ./funclatency -h
+usage: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-F] [-r]
+                   pattern
 
 Time kernel funcitons and print latency as a histogram
 
@@ -213,6 +255,7 @@ optional arguments:
   -T, --timestamp       include timestamp on output
   -u, --microseconds    microsecond histogram
   -m, --milliseconds    millisecond histogram
+  -F, --function        show a separate histogram per function
   -r, --regexp          use regular expressions. Default is "*" wildcards
                         only.
 
@@ -223,3 +266,4 @@ examples:
     ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
     ./funclatency -p 181 vfs_read   # time process 181 only
     ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency -F 'vfs_r*'       # show one histogram per matched function