###方法一:使用hdfs库读取HDFS文件
###在读取数据时,要加上 encoding='utf-8',否则字符串前面会有b'xxx'
###先写入list,再转为df,注意要对数据进行分列,最后要对指定字段转换数据类型
# Method 1: read a CSV file from HDFS with the `hdfs` library.
#
# Flow: stream the file line by line, parse the lines with the stdlib csv
# module, build a DataFrame whose column names come from the header row, and
# finally cast every column to int64.
import csv

import pandas as pd
from hdfs.client import Client

client = Client("http://hadoop-1-1:50070")

# encoding='utf-8' is required: per the original author's note, without it the
# strings come back with a b'xxx' prefix.
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    # Blank lines are dropped here; they would otherwise become ragged rows
    # and make the integer cast below fail.
    lines = [text for text in (raw.strip() for raw in reader) if text]

if not lines:
    raise ValueError("Wholesale_customers_data.csv is empty: no header row found")

# csv.reader (instead of str.split(',')) copes with quoted fields and avoids
# building one pandas Series per row.
rows = list(csv.reader(lines))
column_list = rows[0]  # first line of the file is the header
df = pd.DataFrame(rows[1:], columns=column_list)

# Every field is still a string at this point.
print(df.dtypes)
# Output recorded by the original author:
#   Region              object
#   Fresh               object
#   Milk                object
#   Grocery             object
#   Frozen              object
#   Detergents_Paper    object
#   Delicassen          object
#   target              object
#   dtype: object

# Explicit 'int64' rather than 'int': the latter is platform dependent and the
# documented result is int64.
df = df.astype('int64')
print(df.dtypes)
# Output recorded by the original author:
#   Region              int64
#   Fresh               int64
#   Milk                int64
#   Grocery             int64
#   Frozen              int64
#   Detergents_Paper    int64
#   Delicassen          int64
#   target              int64
#   dtype: object
###方法二:采用pydoop库读取HDFS文件
# Method 2: read a CSV file from HDFS with the pydoop library.
#
# Flow: read the file line by line, parse the lines with the stdlib csv
# module, attach the hard-coded column names, then cast the four measurement
# columns to float64.
import csv

import pandas as pd
import pydoop.hdfs as hdfs

column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']
# Everything except the trailing 'Species' label is numeric.
numeric_columns = column_list[:-1]

# NOTE(review): every line is treated as data (no header row is skipped),
# while the pd.read_table section of this file reads the same path as if it
# had a header -- confirm which variant of iris.csv is deployed.
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    # Blank lines are dropped; they would otherwise become ragged rows and
    # make the float cast below fail.
    lines = [text for text in (raw.strip() for raw in f) if text]

# csv.reader (instead of str.split(',')) copes with quoted fields and avoids
# building one pandas Series per row.
df = pd.DataFrame(list(csv.reader(lines)), columns=column_list)

# Adjust the data types: fields are parsed as strings, so cast the numeric ones.
df[numeric_columns] = df[numeric_columns].astype('float64')
print(df.dtypes)
# Output recorded by the original author:
#   Sepal_Length    float64
#   Sepal_Width     float64
#   Petal_Length    float64
#   Petal_Width     float64
#   Species          object
#   dtype: object
# Method 3: let pandas parse the HDFS file directly with pd.read_table.
import pandas as pd
import pydoop.hdfs as hdfs

numeric_columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']

# This copy of the data has a header row.
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    # pd.read_table splits on tabs by default, which would leave every
    # comma-separated row in a single column that then has to be re-split by
    # hand. Passing sep="," makes pandas do the column split itself.
    df = pd.read_table(f, sep=",")

column_list = df.columns.tolist()

print(df.head())
# First data rows of the file, as recorded by the original author:
#   5.1,3.5,1.4,0.2,setosa
#   4.9,3,1.4,0.2,setosa
#   4.7,3.2,1.3,0.2,setosa
#   4.6,3.1,1.5,0.2,setosa
#   5,3.6,1.4,0.2,setosa

# Pin the four measurement columns to float regardless of what type inference
# picked (a column holding only whole numbers would otherwise be read as int).
df[numeric_columns] = df[numeric_columns].astype('float')
print(df.dtypes)
# Output recorded by the original author:
#   Sepal_Length    float64
#   Sepal_Width     float64
#   Petal_Length    float64
#   Petal_Width     float64
#   Species          object
#   dtype: object