------------恢复内容开始------------
来自https://blog.csdn.net/sinat_36190649/article/details/107941638,根据需求略做修改
import pymysql import pandas as pd import gc import time import threading class Sql_df(object): def __init__(self,input_db): self.host = ip地址 self.port = 3306 self.username = 用户名 self.password = 密码 self.input_db = input_db self.conn = pymysql.connect(host = self.host,port = self.port,user = self.username,passwd = self.password,db = self.input_db,charset = 'utf8') def sql_input_all(self,sql_state): #全部读取用pd处理 cur_1 = self.conn.cursor(cursor = pymysql.cursors.DictCursor) cur_1.execute(sql_state+' limit 1') column_df = cur_1.fetchall() column_list = column_df[0].keys() cur_2 = self.conn.cursor() start_time = time.time() cur_2.execute(sql_state) tmp_list = cur_2.fetchall() result_df = pd.DataFrame(list(tmp_list),columns = column_list) end_time = time.time() during_time = round(end_time-start_time,0)/60 print('input data has spend %f minutes'%during_time) return result_df def sql_input_batch(self,sql_state,nums_sql_state,batch_size): #分批处理写入txt文件 cur_2 = self.conn.cursor() start_time = time.time() cur_2.execute(nums_sql_state) nums_sample = cur_2.fetchall()[0][0] batches = nums_sample//batch_size cur_3 = self.conn.cursor() for i in range(batches): cur_3.execute(sql_state+' limit '+str(i*batch_size)+','+str(batch_size)) tmp_d = cur_3.fetchall() tmp_list = ['. '.join(i) for i in tmp_d] with open('./all_data.txt', 'a+', encoding='utf-8') as f: for i in tmp_list: line = i.split() if len(line) < 30: continue new = content_pro(line) f.write(new+'\n') del tmp_d del tmp_list gc.collect() gc.collect() last_index = batches*batch_size cur_3.execute(sql_state+' limit '+str(last_index)+','+str(nums_sample-last_index)) tmp_d = cur_3.fetchall() tmp_list = ['. '.join(i) for i in tmp_d] with open('./all_data.txt', 'a+', encoding='utf-8') as f: for i in tmp_list: line = i.split() if len(line) < 30: continue new = content_pro(line) f.write(new+'\n') end_time = time.time() during_time = round(end_time-start_time,0)/60 print('input data has spend %f minutes'%during_time) del tmp_d gc.collect() return 'success' if __name__ == '__main__': data_input = Sql_df(数据库名) rr = data_input.sql_input_batch('select * from 表名','select count(1) from 表名',1000000)
------------恢复内容结束------------