一、安装Spark
验证截图:
(Spark运行成功)
(基本操作以及简单运算)
二、Python编程练习:英文文本的词频统计
源码:
# Word-frequency count for an English text file: lower-case the text, turn
# punctuation into spaces, drop the stop words, and print every remaining
# word with its number of occurrences, most frequent first.
from collections import Counter

path = '/home/hadoop/wc/f1.txt'

# Characters that are replaced by a space before the text is split into
# words. The apostrophe is not listed, so contractions such as "don't"
# stay in one piece.
punctuation = '!"@#$%^&*()+,-./:;<=>?@[\\]_`~{|}'

# Words that are left out of the count.
stop_words = [
    'so', 'out', 'all', 'for', 'of', 'to', 'on', 'in', 'if', 'by', 'under',
    'it', 'at', 'into', 'with', 'about', 'i', 'am', 'are', 'is', 'a', 'the',
    'and', 'that', 'before', 'her', 'she', 'my', 'be', 'an', 'from', 'would',
    'me', 'got',
]


def tokenize(text):
    """Return the lower-cased words of *text* with punctuation removed.

    Every character in ``punctuation`` is replaced by a space, then the
    text is split on whitespace. An empty text yields an empty list.
    """
    # One translate pass replaces the chain of per-character replace calls.
    table = str.maketrans(punctuation, " " * len(punctuation))
    return text.lower().translate(table).split()


def count_words(text, excluded=None):
    """Count the words of *text*, ignoring the excluded ones.

    Args:
        text: The raw text to analyse.
        excluded: Iterable of words to leave out; defaults to the
            module-level ``stop_words``.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
        Words with equal counts keep the order of their first appearance.
    """
    # A set makes each membership test O(1) instead of a scan of the list.
    skip = frozenset(stop_words if excluded is None else excluded)
    kept = (word for word in tokenize(text) if word not in skip)
    return Counter(kept).most_common()


def format_counts(ranked):
    """Return one output line per ``(word, count)`` pair in *ranked*.

    The word is left-aligned in a 20-character column, followed by the count.
    """
    return ["{0:<20}{1}".format(word, count) for word, count in ranked]


def main(file_path=None):
    """Read the text file and print its word frequencies.

    Args:
        file_path: File to read; defaults to the module-level ``path``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if file_path is None:
        file_path = path
    # The encoding is explicit so the result does not depend on the
    # locale of the machine running the script.
    with open(file_path, encoding="utf-8") as f:
        text = f.read()
    for line in format_counts(count_words(text)):
        print(line)


if __name__ == "__main__":
    main()
运行结果: