Reading HDFS Files with Python

### Method 1: reading an HDFS file with the hdfs library

Pass encoding='utf-8' when reading; otherwise the strings come back as bytes, prefixed with b'xxx'. The approach below first collects the lines into a list, converts the list to a DataFrame, splits each row into columns, and finally casts the relevant columns to the proper dtype.
from hdfs.client import Client
import pandas as pd

client = Client("http://hadoop-1-1:50070")

lines = []
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    for line in reader:
        lines.append(line.strip())

column_str = lines[0]
column_list = column_str.split(',')

data = {"item_list": lines[1:]}

df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series([i for i in x.split(",")]))  ## split each row into named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the intermediate column

df.dtypes
"""
Region              object
Fresh               object
Milk                object
Grocery             object
Frozen              object
Detergents_Paper    object
Delicassen          object
target              object
dtype: object
"""

df = df.astype('int')  ## cast the object columns to int64
df.dtypes
"""
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
target              int64
dtype: object
"""
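The list-then-split detour can be skipped: the reader yielded by client.read is file-like, so pandas can parse it directly. A minimal sketch, assuming the same NameNode address and file path as above:

from hdfs.client import Client
import pandas as pd

client = Client("http://hadoop-1-1:50070")

## pd.read_csv consumes the file-like reader directly: the first line
## becomes the header and numeric columns are inferred, so the manual
## split and astype('int') steps fall away
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    df = pd.read_csv(reader)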
### Method 2: reading an HDFS file with the pydoop library
import pydoop.hdfs as hdfs
import pandas as pd

lines = []
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    for line in f:
        ## print(line)
        lines.append(line.strip())

column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

data = {"item_list": lines[0:]}

df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series([i for i in x.split(",")]))  ## split each row into named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the intermediate column

## cast the four measurement columns to float64
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float64')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""
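The list detour is avoidable here too: the handle returned by hdfs.open is a regular file-like object, so pandas can consume it directly. A minimal sketch under the same assumption the snippet above makes (no header row in the file, column names supplied by hand):

import pydoop.hdfs as hdfs
import pandas as pd

column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

## header=None matches the snippet above, which treats every line as data;
## pd.read_csv then infers float64 for the four measurement columns
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    df = pd.read_csv(f, header=None, names=column_list)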
### Reading directly with pd.read_table

import pydoop.hdfs as hdfs
import pandas as pd

### this file includes a header row
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    df = pd.read_table(f)

column_list = df.columns[0].split(",")
df[column_list] = df.iloc[:, 0].apply(lambda x: pd.Series([i for i in x.split(",")]))  ## note: this must be written as df.iloc[:, 0]

df.head()
"""
   Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species  Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Species
0  5.1,3.5,1.4,0.2,setosa                                     5.1           3.5          1.4           0.2          setosa
1  4.9,3,1.4,0.2,setosa                                       4.9           3            1.4           0.2          setosa
2  4.7,3.2,1.3,0.2,setosa                                     4.7           3.2          1.3           0.2          setosa
3  4.6,3.1,1.5,0.2,setosa                                     4.6           3.1          1.5           0.2          setosa
4  5,3.6,1.4,0.2,setosa                                       5             3.6          1.4           0.2          setosa
"""

df.drop(df.columns[0], axis=1, inplace=True)
df.dtypes
"""
Sepal_Length    object
Sepal_Width     object
Petal_Length    object
Petal_Width     object
Species         object
dtype: object
"""

##### cast the 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width' fields to float
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""
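The split-and-drop detour above is only needed because pd.read_table defaults to a tab separator, so each CSV line lands in a single column. Passing sep=',' (or simply using pd.read_csv) parses the file in one step; a minimal sketch:

import pydoop.hdfs as hdfs
import pandas as pd

## an explicit comma separator makes read_table behave like read_csv:
## the header row is parsed, the columns arrive already split, and the
## four measurement columns are inferred as float64
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    df = pd.read_table(f, sep=',')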

Reposted from: https://my.oschina.net/kyo4321/blog/3016864
