#!/usr/bin/env python """Word frequencies - count word frequencies in a string.""" def word_freq(text): """Return a dictionary of word frequencies for the given text.""" freqs = {} for word in text.split(): freqs.setdefault(word,0) freqs[word] += 1 return freqs def print_vk(lst): """Print a list of value/key pairs nicely formatted in key/value order.""" # Find the longest key: remember, the list has value/key paris, so the key # is element [1], not [0] longest_key = max(map(lambda x: len(x[1]),lst)) # Make a format string out of it fmt = '%'+str(longest_key)+'s -> %s' # Do actual printing for v,k in lst: print fmt % (k,v) def freq_summ(freqs,n=10): """Print a simple summary of a word frequencies dictionary. Inputs: - freqs: a dictionary of word frequencies. Optional inputs: - n: the number of """ words,counts = freqs.keys(),freqs.values() # Sort by count items = zip(counts,words) items.sort() print 'Number of words:',len(freqs) print print '%d least frequent words:' % n print_vk(items[:n]) print print '%d most frequent words:' % n print_vk(items[-n:]) if __name__ == '__main__': import gzip, sys data_file = sys.argv[1] text = gzip.open(data_file).read() freqs = word_freq(text) freq_summ(freqs,15)