1. Edit the configuration file
/usr/local/spark/conf/spark-env.sh
and add the following line so that Spark can find the Hadoop class path:
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
2. Next, edit the ~/.bashrc configuration file and add:
# spark python
export SPARK_HOME=/usr/local/spark
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.2-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=python3
export PATH=$PATH:$SPARK_HOME/bin
3. After the environment variables are configured, apply them with:
source ~/.bashrc
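Once the variables are loaded, you can start the interactive shell with the pyspark command and run a small job to confirm that Spark works with the configuration above. This is only a minimal sketch; the data is an arbitrary example:

# run inside the pyspark shell, where the SparkContext `sc` is already defined
rdd = sc.parallelize(range(100))        # a small in-memory dataset
print(rdd.map(lambda x: x * 2).sum())   # prints 9900 if the job runs correctly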
Use exit() or quit() to leave the pyspark shell.

The word count program first defines a function that reads the text file, converts it to lower case, and replaces special characters with spaces:

def getText():
    txt = open("/usr/local/spark/pythonspark/workcount.txt", "r").read()  # read the file
    txt = txt.lower()  # convert all letters to lower case to make counting easier
    # replace the special characters in the text with spaces
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~':
        txt = txt.replace(ch, " ")
    return txt
hamletTxt = getText()
# read the stop-word file and build the list of stop words
f = open("/usr/local/spark/pythonspark/ceasetext.txt", "r", encoding='UTF-8')
a = f.read()
ceasetext_n = a.split()
f.close()
With the stop-word list from ceasetext.txt loaded, remove the stop words from the text:
# remove the stop words
words = hamletTxt.split()  # split the text on whitespace
finalword = []
for word in words:
    if word not in ceasetext_n:
        finalword.append(word)
counts = {}  # count how many times each word occurs
for word in finalword:
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# sort the words by number of occurrences, from most to least frequent
items.sort(key=lambda x: x[1], reverse=True)
# print the 50 most frequent words
for i in range(50):
    word, count = items[i]
    print("{0:<10}{1:>5}".format(word, count))
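Since Spark is already configured, the same counting logic could also be written against PySpark's RDD API. The following is only a sketch, run in the pyspark shell where sc is predefined and ceasetext_n is the stop-word list built above; the punctuation cleanup from getText() is omitted for brevity:

lines = sc.textFile("file:///usr/local/spark/pythonspark/workcount.txt")
words = lines.flatMap(lambda line: line.lower().split())      # split every line into words
filtered = words.filter(lambda w: w not in ceasetext_n)       # drop the stop words
pairs = filtered.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)  # count occurrences
for word, count in pairs.takeOrdered(50, key=lambda x: -x[1]):
    print("{0:<10}{1:>5}".format(word, count))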
To install PyCharm, extract the package to the directory you want and give your user ownership of it (replace the placeholders with your own user name and path):
tar -zxvf pycharm-community-2021.3.2.tar.gz -C <target path>
sudo chown -R <username> <target path>
Then start PyCharm with the startup script in the bin directory of the extracted folder:
pycharm.sh
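Once PyCharm is up, you can check that it sees PySpark by running a short script from a new project. This is only a sketch; the file name is arbitrary, and it assumes the PYTHONPATH entries from ~/.bashrc are visible to PyCharm (for example by adding them to the run configuration's environment variables):

# test_pyspark.py - hypothetical check script, not part of the original steps
from pyspark import SparkContext

sc = SparkContext("local[*]", "PyCharmCheck")
rdd = sc.parallelize(["hello spark", "hello pycharm"])
print(rdd.flatMap(lambda s: s.split()).countByValue())  # word frequencies of the two lines
sc.stop()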