"""Split the ``Geolocation`` column of the Texas cities dataset into tidy columns.

The split/extract patterns used below imply that each ``Geolocation`` value
looks like ``"<latitude>° <N|S>, <longitude>° <E|W>"`` -- TODO confirm against
``data/texas_cities.csv``.
"""
import pandas as pd


def _to_numeric_if_possible(column):
    """Return *column* converted by ``pd.to_numeric``, or unchanged on failure.

    Explicit replacement for ``pd.to_numeric(column, errors='ignore')``:
    the conversion error is caught here and the original column is
    returned as-is when it cannot be parsed as numbers.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Read the texas_cities dataset.
cities = pd.read_csv('data/texas_cities.csv')
print(cities)

# Split Geolocation into four separate columns.
# The pattern is a regular expression: '.' matches any single character, so
# '. ' matches both the degree-sign-plus-space and the comma-plus-space
# separators. expand=True returns the pieces as DataFrame columns instead of
# a Series of lists.
geolocations = cities.Geolocation.str.split(pat='. ', expand=True)
geolocations.columns = ['latitude', 'latitude direction',
                        'longitude', 'longitude direction']
print(geolocations)

# Convert the coordinate columns from strings to floats.
geolocations = geolocations.astype({'latitude': 'float', 'longitude': 'float'})
print(geolocations.dtypes)
# Output recorded in the original notes:
# latitude               float64
# latitude direction      object
# longitude              float64
# longitude direction     object
# dtype: object

# Join the new columns with the original City column.
cities_tidy = pd.concat([cities['City'], geolocations], axis='columns')
print(cities_tidy)

# --- How it works ---

# pd.to_numeric converts each column to integers or floats where possible;
# columns that cannot be parsed (the direction letters) are left unchanged.
temp = geolocations.apply(_to_numeric_if_possible)
print(temp)

# Check the data types again.
print(temp.dtypes)
# Output recorded in the original notes:
# latitude               float64
# latitude direction      object
# longitude              float64
# longitude direction     object
# dtype: object

# The '|' alternation lets the split act on several different delimiters.
print(cities.Geolocation.str.split(pat='° |, ', expand=True))

# A more elaborate approach: capture the four parts with regex groups.
print(cities.Geolocation.str.extract(
    '([0-9.]+). (N|S), ([0-9.]+). (E|W)', expand=True))