Let's say my directory structure looks like this:
/-- am here
/one/some/dir
/two
/three/has/many/leaves
/hello/world
Say /one/some/dir contains one big 500 MB file, and each folder along /three/has/many/leaves contains a 400 MB file.
I want to compute the cumulative size of each directory, so that I get output like this:
/ - in total for all
/one/some/dir 500mb
/two 0
/three/has/many/leaves - 400mb
/three/has/many 800
/three/has/ 800+someotherbigfilehere
How would I go about this?
Have a look at os.walk. Specifically, its documentation has an example that finds the size of a directory:
import os
from os.path import join, getsize
# For every directory below the starting point, report the combined size of
# the non-directory files that sit directly inside it.
for root, dirs, files in os.walk('python/Lib/email'):
    # One print() call replaces the three Python 2 print statements that were
    # chained with trailing commas; the emitted line is the same, and the
    # snippet is no longer a SyntaxError on Python 3.
    print(root, "consumes",
          sum(getsize(join(root, name)) for name in files),
          "bytes in", len(files), "non-directory files")
    if 'CVS' in dirs:
        # The walk is top-down (the default), so removing a name from `dirs`
        # in place stops os.walk from descending into it.
        dirs.remove('CVS')  # don't visit CVS directories
This should be easy enough to modify for your purposes.
Here's an untested version in response to your comment:
import os
from os.path import join, getsize
# Maps each visited directory path to its cumulative size in bytes.
dirs_dict = {}
# We need to walk the tree from the bottom up so that a directory can have
# easy access to the size of its subdirectories.
for root, dirs, files in os.walk('python/Lib/email', topdown=False):
    # Sum the sizes of every non-directory file in this directory.
    size = sum(getsize(join(root, name)) for name in files)
    # Add up the already-computed sizes of the subdirectories. os.walk lists
    # symlinked directories (and directories it fails to read) in `dirs`
    # without ever visiting them, so they have no entry in `dirs_dict`;
    # count those as 0 instead of failing with a KeyError.
    subdir_size = sum(dirs_dict.get(join(root, d), 0) for d in dirs)
    # Store the size of this directory (plus subdirectories) so the parent
    # directory can look it up later.
    my_size = dirs_dict[root] = size + subdir_size
    print('%s: %d' % (root, my_size))
Actually, @mgilson's answer does not work if there are symbolic links in the directories. To handle them, you have to do this instead:
# Maps every visited path (directories and files alike) to its size in bytes;
# symbolic links are recorded with size 0.
dirs_dict = {}
# Bottom-up walk: subdirectories are measured before their parent.
for root, dirs, files in os.walk(directory, topdown=False):
    if os.path.islink(root):
        # Plain 0 instead of the Python 2 long literal `0L`, which is a
        # SyntaxError on Python 3 (ints grow as needed on both versions).
        dirs_dict[root] = 0
    else:
        # Start from the size reported for the directory entry itself.
        dir_size = getsize(root)
        # Loop through every non-directory file in this directory and sum
        # their sizes, skipping symbolic links.
        for name in files:
            full_name = join(root, name)
            if os.path.islink(full_name):
                nsize = 0
            else:
                nsize = getsize(full_name)
            dirs_dict[full_name] = nsize
            dir_size += nsize
        # Look at all of the subdirectories and add up their sizes from
        # `dirs_dict`.
        subdir_size = 0
        for d in dirs:
            full_d = join(root, d)
            if os.path.islink(full_d):
                dirs_dict[full_d] = 0
            else:
                # A directory os.walk could not read is never visited and so
                # has no entry; treat it as empty rather than raising KeyError.
                subdir_size += dirs_dict.get(full_d, 0)
        dirs_dict[root] = dir_size + subdir_size
The following script prints the size of every sub-directory of the specified directory. It should be platform-independent (POSIX, Windows, etc.). Where possible, it also benefits from caching the calls of the recursive function. If the argument is omitted, the script works on the current directory. The output is sorted by directory size, from largest to smallest, so you can adapt it to your needs.
P.S. I've used recipe 578019 to show directory sizes in a human-friendly format.
from __future__ import print_function
import os
import sys
import operator
def null_decorator(ob):
    """Do-nothing decorator: return the decorated object unchanged."""
    return ob
# Choose the memoizing decorator: functools.lru_cache where the interpreter
# is new enough (3.2+), otherwise the do-nothing stand-in.
if sys.version_info < (3,2,0):
    my_cache_decorator = null_decorator
else:
    import functools
    my_cache_decorator = functools.lru_cache(maxsize=4096)

# Directory to analyse: the first command-line argument made absolute and
# normalized, or '.' when no argument was given.
if len(sys.argv) > 1:
    start_dir = os.path.normpath(os.path.abspath(sys.argv[1]))
else:
    start_dir = '.'
@my_cache_decorator
def get_dir_size(start_path = '.'):
    """Return the total size in bytes of the files below ``start_path``.

    Sub-directories are measured by recursive calls to this function;
    symbolic links are neither followed nor counted.
    """
    if hasattr(os, 'scandir'):
        # fast path: 'os.scandir' (new in version 3.5)
        byte_count = 0
        for item in os.scandir(start_path):
            if item.is_dir(follow_symlinks=False):
                byte_count += get_dir_size(item.path)
            elif item.is_file(follow_symlinks=False):
                byte_count += item.stat().st_size
        return byte_count

    # slow, but compatible path: 'os.listdir'
    byte_count = 0
    for name in os.listdir(start_path):
        candidate = os.path.abspath(os.path.join(start_path, name))
        if os.path.islink(candidate):
            continue
        if os.path.isdir(candidate):
            byte_count += get_dir_size(candidate)
        elif os.path.isfile(candidate):
            byte_count += os.path.getsize(candidate)
    return byte_count
def get_dir_size_walk(start_path = '.'):
    """Return the total size in bytes of the files below ``start_path``.

    The tree is traversed with os.walk. Symbolic links are not counted:
    os.path.getsize follows links, so a dangling link would raise OSError
    and abort the whole computation, and skipping links keeps the result in
    line with the recursive get_dir_size variant.

    Args:
        start_path: directory to measure; defaults to the current directory.

    Returns:
        The summed size, in bytes, of every non-link file found.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                # may point nowhere, or at a file outside the tree
                continue
            total_size += os.path.getsize(fp)
    return total_size
def bytes2human(n, format='%(value).0f%(symbol)s', symbols='customary'):
    """Convert n bytes into a human readable string based on format.

    (c) http://code.activestate.com/recipes/578019/

    Args:
        n: number of bytes; truncated with int() and must not be negative.
        format: %-style template built from the keys ``value`` (the scaled
            number) and ``symbol`` (the unit name).
        symbols: name of the unit table to use, one of "customary",
            "customary_ext", "iec" or "iec_ext", see:
            https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000

    Returns:
        ``format`` filled in with the scaled value and its unit symbol.

    Raises:
        ValueError: if n is negative.
        KeyError: if ``symbols`` does not name a known unit table.

    With the default format the value is rounded to a whole number and
    written directly in front of its symbol:

    >>> bytes2human(0)
    '0B'
    >>> bytes2human(0.9)
    '0B'
    >>> bytes2human(1)
    '1B'
    >>> bytes2human(1.9)
    '1B'
    >>> bytes2human(1024)
    '1K'
    >>> bytes2human(1048576)
    '1M'
    >>> bytes2human(1099511627776127398123789121)
    '909Y'

    The unit table is selected with ``symbols``:

    >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="customary")
    '9.6 K'
    >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="customary_ext")
    '9.6 kilo'
    >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="iec")
    '9.6 Ki'
    >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="iec_ext")
    '9.6 kibi'

    Precision and layout can be adjusted by playing with the %f operator:

    >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
    '9.8 K/sec'
    >>> bytes2human(10000, format="%(value).5f %(symbol)s")
    '9.76562 K'
    """
    # NOTE(review): 'iotta' looks like a misspelling of 'yotta'; it is kept
    # as-is because callers may depend on the exact output text.
    SYMBOLS = {
        'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
        'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                           'zetta', 'iotta'),
        'iec'           : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
        'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                           'zebi', 'yobi'),
    }
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    symbols = SYMBOLS[symbols]
    # The unit at index k is worth 2 ** (10 * k) bytes. Try the largest unit
    # first and stop at the first one that n reaches.
    for exponent in range(len(symbols) - 1, 0, -1):
        threshold = 1 << (exponent * 10)
        if n >= threshold:
            # An explicit mapping (rather than locals()) makes the keys a
            # template may use the same on every return path.
            return format % dict(symbol=symbols[exponent],
                                 value=float(n) / threshold)
    # Below 1024: report in the base unit, unscaled.
    return format % dict(symbol=symbols[0], value=n)
############################################################
###
### main ()
###
############################################################
if __name__ == '__main__':
    # Two interchangeable ways of measuring a single directory exist:
    #   get_dir_size_walk - the 'slow' version built on os.walk
    #   get_dir_size      - recursive; can benefit from caching the function
    #                       calls (functools.lru_cache)
    get_size = get_dir_size

    # Map every sub-directory found below start_dir to its total size.
    dir_tree = {}
    for parent, subdirs, _files in os.walk(start_dir):
        for name in subdirs:
            candidate = os.path.join(parent, name)
            if os.path.isdir(candidate):
                dir_tree[candidate] = get_size(candidate)

    # Report the directories, biggest first.
    by_size = sorted(dir_tree.items(), key=operator.itemgetter(1), reverse=True)
    for path, nbytes in by_size:
        human = bytes2human(nbytes, format='%(value).2f%(symbol)s')
        print('%s\t%s' % (human, path))

    print('-' * 80)
    # Cache statistics only exist when functools.lru_cache is in use.
    if sys.version_info >= (3,2,0):
        print(get_dir_size.cache_info())
Sample output:
37.61M .\subdir_b
2.18M .\subdir_a
2.17M .\subdir_a\subdir_a_2
4.41K .\subdir_a\subdir_a_1
----------------------------------------------------------
CacheInfo(hits=2, misses=4, maxsize=4096, currsize=4)
I achieved this with this code:
def get_dir_size(path=None):
    """Print the size of each directory below ``path`` and the grand total.

    For every directory reached by os.walk, one line is printed with the
    combined size in bytes of the files directly inside it (sub-directories
    are not included in that figure), followed by a final line with the
    grand total in kilobytes.

    Args:
        path: directory to analyse. Defaults to the current working
            directory, looked up when the function is called; a default of
            ``os.getcwd()`` in the signature would be evaluated only once,
            at definition time, and ignore later directory changes.

    Returns:
        The grand total in bytes.
    """
    if path is None:
        path = os.getcwd()
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        dirsize = sum(os.path.getsize(os.path.join(dirpath, f))
                      for f in filenames)
        total_size += dirsize
        print('\t', dirsize, dirpath)
    # Dividing by a float keeps the fractional part on Python 2 as well.
    print(" {0:.2f} Kb".format(total_size / 1024.0))
    return total_size