football数据集是根据美国大学生橄榄球(American football)联赛创建的一个数据集,包含115支球队(即图中的节点)、616条比赛数据(即图中的边)。
import networkx as nx import random from tqdm import tqdm from sklearn.decomposition import PCA import matplotlib.pyplot as plt
# Build the graph from the GML file of the football dataset.
G = nx.read_gml('football.gml')

# Gather the summary first, then print it between two separator lines.
team_count = len(G)      # number of teams (nodes)
team_names = G.nodes()   # names of all teams
print('----------------')
print('len(G):', team_count)
print('G.nodes():', team_names)
print('----------------')
# Listing all games (edges) is left disabled.
# print('G.edges():', G.edges())
运行结果:
len(G): 115 G.nodes(): ['TexasElPaso', 'ArizonaState', 'MiamiFlorida', 'Army', 'MichiganState', 'SanDiegoState', 'ColoradoState', 'MiddleTennesseeState', 'Michigan', 'OhioState', 'AlabamaBirmingham', 'Vanderbilt', 'Mississippi', 'Clemson', 'California', 'NorthCarolina', 'Temple', 'Auburn', 'Maryland', 'Baylor', 'Wisconsin', 'Florida', 'WashingtonState', 'CentralMichigan', 'SouthernMississippi', 'Illinois', 'IowaState', 'BallState', 'Tennessee', 'UtahState', 'Nebraska', 'ArkansasState', 'NewMexico', 'KansasState', 'Indiana', 'Akron', 'SouthCarolina', 'Pittsburgh', 'Memphis', 'Colorado', 'Hawaii', 'Duke', 'Alabama', 'Navy', 'VirginiaTech', 'Tulane', 'Syracuse', 'BostonCollege', 'Arizona', 'FloridaState', 'SouthernMethodist', 'SanJoseState', 'Arkansas', 'TexasTech', 'Wyoming', 'Stanford', 'Kentucky', 'Utah', 'Missouri', 'NorthTexas', 'MississippiState', 'EastCarolina', 'Nevada', 'Purdue', 'PennState', 'LouisianaState', 'Virginia', 'Iowa', 'Tulsa', 'Rutgers', 'NotreDame', 'WesternMichigan', 'AirForce', 'Connecticut', 'Oregon', 'NewMexicoState', 'NorthernIllinois', 'Kansas', 'CentralFlorida', 'Northwestern', 'TexasA&M', 'OregonState', 'Cincinnati', 'LouisianaMonroe', 'Oklahoma', 'TexasChristian', 'Idaho', 'BrighamYoung', 'SouthernCalifornia', 'Marshall', 'Ohio', 'NorthCarolinaState', 'Houston', 'Kent', 'Toledo', 'BowlingGreenState', 'OklahomaState', 'WestVirginia', 'BoiseState', 'Minnesota', 'Texas', 'LouisianaLafayette', 'MiamiOhio', 'NevadaLasVegas', 'GeorgiaTech', 'Louisville', 'LouisianaTech', 'Rice', 'EasternMichigan', 'WakeForest', 'Washington', 'Georgia', 'FresnoState', 'UCLA', 'Buffalo'] ----------------
从一个节点出发进行随机游走:路径最多包含10个节点(含起点),不会重复访问已经走过的节点;当当前节点没有未访问的邻居时,游走提前结束。
def get_randomwalk(node, path_length):
    """Walk the module-level graph ``G`` at random, starting from ``node``.

    Every step moves to a randomly chosen neighbour of the current node that
    is not yet part of the walk. The walk ends early as soon as the current
    node has no such neighbour.

    Args:
        node: Node of ``G`` the walk starts from.
        path_length: Maximum number of nodes in the walk, start included.

    Returns:
        The visited nodes in visiting order; the first entry is ``node``.
    """
    walk = [node]
    current = node
    for _ in range(path_length - 1):
        # Candidates are the neighbours that the walk has not used so far.
        unvisited = list(set(G.neighbors(current)) - set(walk))
        if not unvisited:
            break
        current = random.choice(unvisited)
        walk.append(current)
    return walk


print('----------------')
print('get_randomwalk:', get_randomwalk('EastCarolina', 10))
运行结果
get_randomwalk: ['EastCarolina', 'AlabamaBirmingham', 'Memphis', 'Houston', 'Louisville', 'Tulane', 'LouisianaLafayette', 'NorthTexas', 'BoiseState', 'NewMexico'] 100%|██████████| 115/115 [00:00<00:00, 3286.87it/s]
# Every node of the graph serves as a starting point.
all_nodes = list(G.nodes())

# Corpus of walks: 5 walks per node, each holding at most 10 nodes.
random_walks = []
for start in tqdm(all_nodes):
    random_walks.extend(get_randomwalk(start, 10) for _ in range(5))

# Show the generated walks and how many there are.
print('----------------')
print('random_walks:', random_walks)
print('----------------')
print(len(random_walks))
import warnings

from gensim.models import Word2Vec

# Suppress every warning raised from here on.
warnings.filterwarnings('ignore')

# Skip-gram (sg=1) word2vec model trained with negative sampling
# (hs=0, negative=10); each random walk plays the role of a sentence.
model = Word2Vec(
    window=4,
    sg=1,
    hs=0,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)

# Build the vocabulary from the walks, then train on the same corpus.
model.build_vocab(random_walks, progress_per=2)
model.train(
    random_walks,
    total_examples=model.corpus_count,
    epochs=20,
    report_delay=1,
)
print('model:', model)

# Teams most similar to EastCarolina. The query goes through `model.wv`:
# calling `similar_by_word` on the model itself is a deprecated proxy in
# gensim 3.x and no longer exists in gensim 4. The printed label is kept
# unchanged so the output matches the article.
print('model.similar_by_word:', model.wv.similar_by_word('EastCarolina'))
运行结果
575 model: Word2Vec(vocab=115, size=100, alpha=0.03) model.similar_by_word: [('Army', 0.9480490684509277), ('FloridaState', 0.9365841746330261), ('Louisville', 0.9252589344978333), ('Tulane', 0.9110944867134094), ('AlabamaBirmingham', 0.9061299562454224), ('GeorgiaTech', 0.9037945866584778), ('WakeForest', 0.9031023979187012), ('Virginia', 0.8917067646980286), ('Duke', 0.8815759420394897), ('Maryland', 0.8811277151107788)]
def plot_nodes(word_list):
    """Plot the embeddings of the given nodes in two dimensions.

    The vectors are read from the module-level ``model``, projected onto
    their first two principal components and drawn as a labelled scatter
    plot.

    Args:
        word_list: Iterable of node names that are in the model vocabulary.
    """
    # Materialise once so the vector lookup and the labels share one order.
    words = list(word_list)

    # Vectors are looked up through `model.wv`; indexing the model object
    # directly is deprecated in gensim 3.x and unsupported in gensim 4.
    X = model.wv[words]

    # Project the embedding vectors onto two dimensions.
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)

    # Scatter plot with one labelled point per node.
    plt.figure(figsize=(12, 9))
    plt.scatter(result[:, 0], result[:, 1])
    for i, word in enumerate(words):
        plt.annotate(word, xy=(result[i, 0], result[i, 1]))
    plt.show()


# Plot every team. gensim 4 exposes the vocabulary as `wv.index_to_key`;
# gensim 3.x only has `wv.vocab`, which is used as the fallback.
plot_nodes(getattr(model.wv, 'index_to_key', None) or model.wv.vocab)
运行结果
# DeepWalk on the football dataset.
import networkx as nx
import random
from tqdm import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the data and build the graph.
G = nx.read_gml('football.gml')
print('----------------')
# Total number of teams.
print('len(G):', len(G))
# Names of the teams.
print('G.nodes():', G.nodes())
print('----------------')
# All games.
print('G.edges():', G.edges())


def get_randomwalk(node, path_length):
    """Walk the module-level graph ``G`` at random, starting from ``node``.

    Every step moves to a randomly chosen neighbour of the current node that
    is not yet part of the walk. The walk ends early as soon as the current
    node has no such neighbour.

    Args:
        node: Node of ``G`` the walk starts from.
        path_length: Maximum number of nodes in the walk, start included.

    Returns:
        The visited nodes in visiting order; the first entry is ``node``.
    """
    random_walk = [node]
    for i in range(path_length - 1):
        # Candidates are the neighbours that the walk has not used so far.
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))
        if len(temp) == 0:
            break
        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
    return random_walk


print('----------------')
print('get_randomwalk:', get_randomwalk('EastCarolina', 10))

# Every node of the graph serves as a starting point.
all_nodes = list(G.nodes())

# Collect the random walks of all nodes in the dataset.
random_walks = []
for n in tqdm(all_nodes):
    # 5 walks per node, each holding at most 10 nodes.
    for i in range(5):
        random_walks.append(get_randomwalk(n, 10))

# Show the generated walks and how many there are.
print('----------------')
print('random_walks:', random_walks)
print('----------------')
print(len(random_walks))

# Learn node embeddings with skip-gram.
from gensim.models import Word2Vec
import warnings

# Suppress every warning raised from here on.
warnings.filterwarnings('ignore')

# Skip-gram (sg=1) word2vec model trained with negative sampling
# (hs=0, negative=10); each random walk plays the role of a sentence.
model = Word2Vec(
    window=4,
    sg=1,
    hs=0,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)

# Build the vocabulary from the walks, then train on the same corpus.
model.build_vocab(random_walks, progress_per=2)
model.train(
    random_walks,
    total_examples=model.corpus_count,
    epochs=20,
    report_delay=1,
)
print('model:', model)

# Teams most similar to EastCarolina. The query goes through `model.wv`:
# calling `similar_by_word` on the model itself is a deprecated proxy in
# gensim 3.x and no longer exists in gensim 4. The printed label is kept
# unchanged so the output matches the article.
print('model.similar_by_word:', model.wv.similar_by_word('EastCarolina'))


def plot_nodes(word_list):
    """Plot the embeddings of the given nodes in two dimensions.

    The vectors are read from the module-level ``model``, projected onto
    their first two principal components and drawn as a labelled scatter
    plot.

    Args:
        word_list: Iterable of node names that are in the model vocabulary.
    """
    # Materialise once so the vector lookup and the labels share one order.
    words = list(word_list)

    # Vectors are looked up through `model.wv`; indexing the model object
    # directly is deprecated in gensim 3.x and unsupported in gensim 4.
    X = model.wv[words]

    # Project the embedding vectors onto two dimensions.
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)

    # Scatter plot with one labelled point per node.
    plt.figure(figsize=(12, 9))
    plt.scatter(result[:, 0], result[:, 1])
    for i, word in enumerate(words):
        plt.annotate(word, xy=(result[i, 0], result[i, 1]))
    plt.show()


# Plot every team. gensim 4 exposes the vocabulary as `wv.index_to_key`;
# gensim 3.x only has `wv.vocab`, which is used as the fallback.
plot_nodes(getattr(model.wv, 'index_to_key', None) or model.wv.vocab)