# Import modules
import ee
import requests
import io
import json
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mpd
from matplotlib.colors import rgb2hex
15 Export data
In many cases, it’s necessary to store data from Google Earth Engine (GEE) locally for offline analysis and integration with other projects and tools. This notebook demonstrates techniques for saving time series data and images to a local hard drive, with similar examples found throughout the book.
A critical factor in downloading data from GEE is its limitation on data size per download request. Downloading and processing large datasets locally negates the advantage of GEE, which is designed to accelerate data processing through its advanced infrastructure.
GEE limits image downloads to a maximum size of 32 MB or a maximum grid dimension of 10,000 pixels.
# Authenticate GEE
#ee.Authenticate()
# Initialize GEE
ee.Initialize()
Select region for tutorial
# Read US watersheds using hydrologic unit codes (HUC)
= ee.FeatureCollection("USGS/WBD/2017/HUC12")
watersheds = watersheds.filter(ee.Filter.eq('huc12', '102701010204')).first()
mcdowell_creek = ee.Geometry.Point([-96.556316, 39.084535])
watershed_point
# Create mask for the region
= ee.Image.constant(1).clip(mcdowell_creek).mask() mask
Raster data
Save geotiff image
In this example we will request a small georeferenced image and then save it in .geotiff format. We will also read the image back into the notebook.
# Import map from Digital Elevation Model (DEM)
= ee.Image('CGIAR/SRTM90_V4') srtm
# Clip elevation map to boundaries of McDowell Creek
# and mask points outside the watershed boundary
= srtm.clip(mcdowell_creek).mask(mask) mcdowell_creek_elv
# Define coordinate reference system
= 32614 # UTM14 Projected coordinates in meters
crs #crs = 4326 # WGS84 Geographic coordinates in degrees
# Get URL link to full image
= mcdowell_creek_elv.getDownloadUrl({
image_url 'region': mcdowell_creek.geometry(),
'scale':30,
'crs': f'EPSG:{crs}',
'format': 'GEO_TIFF'
})
# Display clickable URL link to download TIFF image
print(image_url)
https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/thumbnails/a3c4c75d3ef68b3a9f43b5b1f66fab2a-8e5a43ff7247a0f523a367d9e6d98265:getPixels
If we request the image using "scale: 1"
, meaning that we want an image with each pixel representing 1 square meter, then for this particular watershed ww will get the following error message: "Total request size (691287720 bytes) must be less than or equal to 50331648 bytes."
, which means that we are requesting too much data. To solve this issue we need to reduce the number of pixels by re-scaling the image so that we to meet the quota allowed by GEE.
# Request data using URL and save data as a new GeoTiff file
= requests.get(image_url)
response
# Write file to disk
= '../outputs/mc_dowell_creek_elevation.tif'
filename with open(filename, 'wb') as f:
f.write(response.content)
# Read GeoTiff file using the Xarray package
= xr.open_dataarray(filename).squeeze() raster
# Create a figure
='magma', cbar_kwargs={'label':'Elevation m'});
raster.plot(cmap'Kings Creek Elevation Map')
plt.title('Easting (meters)')
plt.xlabel('Northing (meters)')
plt.ylabel(
plt.tight_layout() plt.show()
In-memory GeoTiff
Sometimes we simply want to retrieve an image and plot it, without necessarily saving it into our local drive.
If the code below does not work, try installating the rioxarray
package using !pip install rioxarray
, which extends xarray
and allows it to use the rasterio engine
to read bytes file.
# Open temporary file in memory, save the content
# of the request (this is the geotiff image) in a temporary file,
# then read the temporary file and plot the map.
with io.BytesIO(response.content) as memory_file:
= xr.open_dataarray(memory_file, engine='rasterio').squeeze()
raster
# Create plot while the file is open
='inferno', cbar_kwargs={'label':'Elevation m'});
raster.plot(cmap'Kings Creek Elevation Map')
plt.title('Easting (meters)')
plt.xlabel('Northing (meters)')
plt.ylabel(
plt.tight_layout()
plt.show()
Handy function to save GeoTiffs
Since this is a common operation, let’s wrap the code we wrote above into a function.
def save_geotiff(ee_image, filename, crs, scale, geom):
"""
Function to save images from Google Earth Engine into local hard drive.
"""
= ee_image.getDownloadUrl({
image_url 'region': geom,
'scale':scale,
'crs': f'EPSG:{crs}',
'format': 'GEO_TIFF'})
# Request data using URL and save data as a new GeoTiff file
= requests.get(image_url)
response with open(filename, 'wb') as f:
f.write(response.content)return print(f'Image {filename} saved.')
Get Numpy array
Sometimes instead of the georeferenced image we want the raw data in the form of a Numpy array, so that we can further process the data in Python.
# Use the requests method to read image
= mcdowell_creek_elv.getDownloadUrl({'crs': f'EPSG:{crs}',
numpy_array_url 'scale':30,
'format': 'NPY'})
= requests.get(numpy_array_url )
response = np.load(io.BytesIO(response.content), encoding='bytes').astype(np.float64)
elev_array print(elev_array)
[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
# Convert zero values in the mask to NaN
= elev_array == 0
idx_zero = np.nan
elev_array[idx_zero]
# Create histogram of elevation
=(5,4))
plt.figure(figsize
plt.hist(elev_array.flatten(), ='scott',
bins='lightblue', edgecolor='k');
facecolor'Elevation (m)')
plt.xlabel('Density')
plt.ylabel( plt.show()
print('Array dimensions:', elev_array.shape)
print('Total pixels:', elev_array.size)
Array dimensions: (496, 401)
Total pixels: 198896
Save thumbnail image
A thumbnail image is good for display purposes. This is not the actual, high-resolution image. This image does not have any elevation data or geographic coordinates. The color of each pixel was assigned according to the colormap and range of values provided when requesting the thumbnail image.
# Create the url associated to the Image you want
= [rgb2hex(c) for c in plt.get_cmap('magma').colors]
cmap = mcdowell_creek_elv.getThumbUrl({'min': 300,
thumbnail_url 'max': 450,
'dimensions': 512,
'palette': cmap})
# Print URL
print(thumbnail_url)
https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/thumbnails/f766af93b23eddb9c6dfc16a16b2dc33-8904153edffc111c602c82500a6f915f:getPixels
# Display a thumbnail of elevation map
= requests.get(thumbnail_url )
response = plt.imread(io.BytesIO(response.content))
img ='magma')
plt.imshow(img, cmap'off')
plt.axis('../outputs/kings_creek_thumbnail.jpg')
plt.savefig( plt.show()
Vector data
Get Feature coordinates into Pandas DataFrame
In this example we will simply access the latitude and longitude fields of a Kansas county.
# US Counties dataset
= ee.FeatureCollection("TIGER/2018/Counties")
US_counties
# '20161' is the ID for Riley county in Kansas
= US_counties.filter(ee.Filter.eq('GEOID','20161'))
county
# Get coordinates of the county geometryand put them in a dataframe for easy data handling.
= pd.DataFrame(county.first().getInfo()['geometry']['coordinates'][0])
df = ['lon','lat']
df.columns df.head()
lon | lat | |
---|---|---|
0 | -96.961683 | 39.220095 |
1 | -96.961369 | 39.220095 |
2 | -96.956566 | 39.220005 |
3 | -96.954188 | 39.220005 |
4 | -96.952482 | 39.220005 |
Save data in tabular format
In this example the goal is to request temporal soil moisture data, convert it into a Pandas DataFrame, and then save it as a comma-separated value file.
# Load SMAP product (3-hour resolution)
= ee.ImageCollection("NASA/SMAP/SPL4SMGP/007").filterDate('2018-01-01','2018-01-31')
smap
# Select rootzone soil moisture band
= smap.select('sm_rootzone')
sm_rootzone
# Get rootzone soil moisture for watershed point
= sm_rootzone.getRegion(watershed_point, scale=1).getInfo() watershed_sm
# Convert output into a Pandas Dataframe
= pd.DataFrame(watershed_sm[1:])
df = watershed_sm[0]
df.columns 'time'] = pd.to_datetime(df['time'], unit='ms')
df[3) df.head(
id | longitude | latitude | time | sm_rootzone | |
---|---|---|---|---|---|
0 | 20180101_0130 | -96.556312 | 39.084535 | 2018-01-01 01:30:00 | 0.297576 |
1 | 20180101_0430 | -96.556312 | 39.084535 | 2018-01-01 04:30:00 | 0.297541 |
2 | 20180101_0730 | -96.556312 | 39.084535 | 2018-01-01 07:30:00 | 0.297562 |
# Save Dataframe to local drive
'../outputs/smap_rootzone_2018.csv', index=False) df.to_csv(
# Create figure to visualize data
= mpd.ConciseDateFormatter(mpd.AutoDateLocator())
date_fmt =(6,3))
plt.figure(figsize'Kings Creek Watershed')
plt.title('time'], df['sm_rootzone']*100)
plt.plot(df['Volumetric water content (%)')
plt.ylabel(
plt.gca().xaxis.set_major_formatter(date_fmt) plt.show()
Save Feature as geoJSON
We will save a polygon feature representing the boundary of the McDowell Creek small watershed in central Kansas as a geoJSON file.
# Convert the feature to a GeoJSON string
= mcdowell_creek.getInfo() # Retrieves the information about the feature
feature_info
# Save the GeoJSON string to a file
with open('../outputs/kings_creek_from_gee.geojson', 'w') as file:
# Convert dictionary to a GeoJSON string and save it
file.write(json.dumps(feature_info))
In this example we will save the boundary of the state of Kansas.
# Read US states
= ee.FeatureCollection("TIGER/2018/States")
US_states
# Select Kansas
= US_states.filter(ee.Filter.inList('NAME',['Kansas']))
state
# Retrieves the information about the feature
= state.getInfo()
state_info
# Save the GeoJSON string to a file
with open('../outputs/kansas_bnd_from_gee.geojson', 'w') as file:
# Convert dictionary to a GeoJSON string and save it
file.write(json.dumps(state_info))
Save FeatureCollection as shapefile
This section requires the GeoPandas library.
# Import GeoPandas
import geopandas as gpd
# Convert FeatureCollection to GeoDataFrame
= gpd.GeoDataFrame.from_features([feature_info])
gdf
# Create a figure to quickly inspect the dataset
='None'); gdf.plot(facecolor
# Save as shapefile
'../outputs/kings_creek_from_gee.shp', index=False) gdf.to_file(
# Save as geojson using GeoPandas
'../outputs/kings_creek_from_gee.geojson', driver='GeoJSON', index=False) gdf.to_file(
Save FeatureCollection as shapefile (Advanced)
In this case we will save a FeatureCollection with Polygon features and with GeometryCollection features, which are not supported by shapefiles. So we will need to iterate over each geometry, and only select polygons for those geometries that contain other features, like Points or LineStrings. Using geojson is recommended in this case to avoid the extra processing to fit the specifications of ESRI Shapefiles.
Ideally all the counties should be Polygons, but sometimes geometries bordering rivers or other states can result in GeometryCollections.
# Read US counties and filter bounds for selected region
= ee.FeatureCollection("TIGER/2018/Counties")
counties
# Get FeatureCollection data
# Kansas has STATEFP number equal to 20
= counties.filter(ee.Filter.eq('STATEFP','20')).getInfo() counties_data
# Define coordinate reference system
= 4326 # World Geodetic System 1984
wgs84
# Convert FeatureCollection to GeoDataFrame
= gpd.GeoDataFrame.from_features(counties_data, crs=wgs84)
gdf_counties 3) gdf_counties.head(
geometry | ALAND | AWATER | CBSAFP | CLASSFP | COUNTYFP | COUNTYNS | CSAFP | FUNCSTAT | GEOID | INTPTLAT | INTPTLON | LSAD | METDIVFP | MTFCC | NAME | NAMELSAD | STATEFP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POLYGON ((-97.15333 37.47553, -97.15329 37.474... | 2915648163 | 17322935 | 11680 | H1 | 035 | 00484987 | 556 | A | 20035 | +37.2345068 | -096.8372468 | 06 | G4020 | Cowley | Cowley County | 20 | |
1 | POLYGON ((-97.15347 37.82517, -97.15342 37.824... | 3702816810 | 43671391 | 48620 | H1 | 015 | 00484977 | 556 | A | 20015 | +37.7736487 | -096.8388402 | 06 | G4020 | Butler | Butler County | 20 | |
2 | POLYGON ((-97.80760 37.47387, -97.80755 37.472... | 3060429574 | 8589030 | 48620 | H1 | 191 | 00481812 | 556 | A | 20191 | +37.2366617 | -097.4933519 | 06 | G4020 | Sumner | Sumner County | 20 |
# Find if we have features that are not polygons
type != 'Polygon'] gdf_counties[gdf_counties.geometry.
geometry | ALAND | AWATER | CBSAFP | CLASSFP | COUNTYFP | COUNTYNS | CSAFP | FUNCSTAT | GEOID | INTPTLAT | INTPTLON | LSAD | METDIVFP | MTFCC | NAME | NAMELSAD | STATEFP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
46 | GEOMETRYCOLLECTION (LINESTRING (-96.52551 36.9... | 1654694134 | 15243776 | H1 | 019 | 00484979 | A | 20019 | +37.1542592 | -096.2453962 | 06 | G4020 | Chautauqua | Chautauqua County | 20 | |||
64 | GEOMETRYCOLLECTION (LINESTRING (-98.34961 37.3... | 2075290355 | 3899269 | H1 | 077 | 00485004 | A | 20077 | +37.1881840 | -098.0665901 | 06 | G4020 | Harper | Harper County | 20 | |||
99 | GEOMETRYCOLLECTION (LINESTRING (-94.87632 39.7... | 1019105692 | 12424173 | 41140 | H1 | 043 | 00484991 | 312 | A | 20043 | +39.7885021 | -095.1472253 | 06 | G4020 | Doniphan | Doniphan County | 20 |
# Let's inspect one of them to see what's inside
# Here we have a LineString and a Polygon.
list(gdf_counties.loc[46,'geometry'].geoms)
# The shapefile will not be able to save this. Let's retain the polygon only
[<LINESTRING (-96.526 36.999, -96.526 36.999)>,
<POLYGON ((-96.526 37.028, -96.526 37.016, -96.526 37.016, -96.526 37.008, -...>]
# Iterate over each row. Here we assume that there is
# only one polygon in each row that has a GeometryCollection.
for k,row in gdf_counties.iterrows():
# Check if row geometry is 'GeometryCollection'
if row['geometry'].geom_type == 'GeometryCollection':
# Iterate over each component of the GeometryCollection
for geom in list(row['geometry'].geoms):
# Overwrite geometry of row with polygon
if geom.geom_type == 'Polygon':
'geometry'] = geom
gdf_counties.loc[k,
# For multiple polygons you can keep the largest one using
# something like: max(polygons, key=lambda a: a.area)
# Check that we don't have any rows left that is not a polygon
# Returned GeoDataFrame should be empty
type != 'Polygon'] gdf_counties[gdf_counties.geometry.
geometry | ALAND | AWATER | CBSAFP | CLASSFP | COUNTYFP | COUNTYNS | CSAFP | FUNCSTAT | GEOID | INTPTLAT | INTPTLON | LSAD | METDIVFP | MTFCC | NAME | NAMELSAD | STATEFP |
---|
# Plot counties to ensure we have all of them
='None'); gdf_counties.plot(facecolor
# Now we can save the GeoDataframe as a Shapefile
'../outputs/kansas_counties_from_gee.shp', index=False) gdf_counties.to_file(