RapidAI4EO Corpus Tutorial¶

Document version: 1.0

Contents¶

  1. Introduction
  2. Downloading the corpus in its entirety
  3. Querying with STAC
    1. Spatial queries
    2. Filtering Assets
    3. Querying related Items
  4. Optimized queries
    1. Spatio-temporal queries
    2. Labels queries
  5. Downloading subsets with azcopy
  6. Conclusion
  7. Appendix: Building asset URLs

1. Introduction¶

The RapidAI4EO corpus is a large dataset of analysis-ready satellite time series imagery, originally designed for land use and land cover (LULC) classification and change detection but applicable to a wide range of research topics in remote sensing and machine learning. The corpus is sampled at 500,000 locations across Europe; each location has a time series of coincident Planet Fusion and Sentinel-2 imagery as well as multiclass LULC labels derived from the 2018 CORINE Land Cover (CLC) inventory. For more details, refer to the corpus documentation.

This tutorial covers some basic aspects of accessing the corpus. Altogether, it contains 79 million image chips in addition to metadata, labels, and other resources. This comes to some 17TB of data. As such, the main goal of the tutorial is to provide patterns for efficiently querying and downloading subsets of the data relevant to researchers who do not require the corpus in its entirety.

Three types of queries, which can also be combined, will be covered here:

  1. Spatial queries: I want to find all assets that intersect with a geometry.
  2. Temporal queries: I am only interested in part of the image time series.
  3. Label queries: I am only interested in images that have a certain CLC label.

In Section 3 Querying with STAC we demonstrate the first two types of queries using the SpatioTemporal Asset Catalog (STAC) standard. As described in further detail there, the corpus has a lot of redundant metadata that can make some STAC queries rather slow. In Section 4 Optimized queries we will look at querying the corpus using files that store the spatial metadata and labels in a centralized, non-redundant format. In both sections we identify the files we want to download by their URLs. In Section 5 Downloading subsets with azcopy we look at one pattern for downloading from such a list of selected files.

Before we begin, we can install the dependencies of the notebook by uncommenting and running the following command:

In [1]:
# !pip install geopandas pandas pystac shapely

2. Downloading the corpus in its entirety¶

Caution: As detailed in the introduction, the corpus is voluminous. If you are only interested in subsets of the data, jump to the following sections.

For large downloads, it is recommended to use AzCopy. The entire corpus can be transferred using the following command:

azcopy copy --recursive 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/' $DESTINATION

3. Querying with STAC¶

The corpus is released with an accompanying static SpatioTemporal Asset Catalog (STAC). STAC is a flexible and provider-agnostic standard for indexing Earth observation imagery, and it comes with a rich ecosystem of tools for accessing data. In this tutorial we will query data with pystac; however, additional tools are also available, including tools for languages other than Python and software plugins. Users can browse and familiarize themselves with the RapidAI4EO STAC using Radiant's STAC Browser user interface. A more detailed description of the RapidAI4EO STAC structure can be found in the "Structure" section of the corpus documentation. A basic level of familiarity with the STAC specification is recommended before proceeding with this section.

Before we jump in, it bears mentioning that our corpus and therefore its STAC contain a lot of redundant metadata. At each of the 500,000 locations in the corpus, we have coincident imagery products (Planet Fusion and Sentinel-2) as well as CLC multiclass labels. We also know that the time series of like products have the same timesteps at all locations. If we aren't careful about how we query the STAC, this will lead to a lot of redundant spatio-temporal comparisons. In the section Optimized queries we use some of this knowledge to find the Assets we're looking for more efficiently.

The additional latency of retrieving STAC files via HTTPS can be reduced by first downloading the STAC and querying the local copy; be aware, however, that the STAC itself is also quite voluminous (approximately 38GB).

In [2]:
import pandas as pd
from pystac import Catalog, Item
import shapely.geometry
In [3]:
catalog_uri = (
    "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/stac-v1.0/catalog.json"
)
catalog = Catalog.from_file(catalog_uri)
In [4]:
for collection in catalog.get_collections():
    print(collection.id)
rapidai4eo_v1_source_pf
rapidai4eo_v1_source_s2
rapidai4eo_v1_labels

When we list the Collections in our Catalog, we see that there is one for each type of product: Planet Fusion imagery (shorthand: pf), Sentinel-2 imagery (s2), and the CLC multiclass labels (labels).

3.A. Spatial queries¶

Now, let's write a function that crawls a STAC Collection and returns all Items intersecting a given geometry. We will search a single Collection, rather than all Collections in the Catalog, because we know that we have matching products at each location. Running the same spatial query over multiple Collections would be redundant, whereas we can use the links in the STAC Items to find the matching Items for other products at the same location. Note that STAC always uses EPSG:4326 geometries.

In [5]:
def items_intersecting_geometry(collection, geometry):
    """Recursively find all STAC items intersecting a geometry.

    Our STAC structure (further detailed in the corpus documentation) has a hierarchy of
    Collections to speed up spatial queries. Recursively search these Collections, and
    return all STAC Items that intersect our geometry.
    """
    intersecting_items = []
    collection_bboxes = [
        shapely.geometry.box(*bounds) for bounds in collection.extent.spatial.bboxes
    ]
    if any(bbox.intersects(geometry) for bbox in collection_bboxes):

        # Collect all matching items in this collection
        for item in collection.get_items():
            item_geometry = shapely.geometry.shape(item.geometry)
            if item_geometry.intersects(geometry):
                intersecting_items.append(item)

        # Recursively search our nested collections for items
        for subcollection in collection.get_collections():
            intersecting_items += items_intersecting_geometry(subcollection, geometry)

    return intersecting_items

For our example query, we will use a rough bounding box around Berlin as our search geometry and we will query the Planet Fusion STAC Collection.

In [6]:
berlin_bbox = shapely.geometry.box(13.05, 52.35, 13.72, 52.69)
pf_collection = catalog.get_child("rapidai4eo_v1_source_pf")
In [7]:
%%time
berlin_pf_items = items_intersecting_geometry(pf_collection, berlin_bbox)
print(f"Found {len(berlin_pf_items)} Items in the vicinity of Berlin.")
Found 198 Items in the vicinity of Berlin.
CPU times: user 30.3 s, sys: 2.08 s, total: 32.4 s
Wall time: 2min 58s

The search takes nearly three minutes, despite querying against a small bounding box. Crawling over the STAC like this is not recommended for large geometries.

Our spatial query has returned 198 Planet Fusion Items. Within our STAC structure, an Item represents all timesteps for one product at one location. Within the Item, the actual files are referenced as Assets.

Let's take a look at a few Assets to get a feel for them.

In [8]:
example_pf_item = berlin_pf_items[0]
example_pf_assets = example_pf_item.get_assets()
n_head = 5

for i, (asset_name, asset) in enumerate(example_pf_assets.items()):
    print(f"Asset: {asset_name}:")
    print(asset.to_dict())
    print("---")

    if i >= n_head - 1:
        break
Asset: sr_2018-01-03:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-01-03.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-03T10:30:00Z', 'roles': ['data']}
---
Asset: qa_2018-01-03:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-QA/2018-01-03.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-03T10:30:00Z', 'roles': ['metadata']}
---
Asset: sr_2018-01-08:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-01-08.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-08T10:30:00Z', 'roles': ['data']}
---
Asset: qa_2018-01-08:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-QA/2018-01-08.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-08T10:30:00Z', 'roles': ['metadata']}
---
Asset: sr_2018-01-13:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-01-13.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-13T10:30:00Z', 'roles': ['data']}
---

3.B. Filtering Assets¶

Assets are identified by a key that is unique per Item. Identifying the timestep to which an Asset belongs can be achieved by looking at the Asset's datetime property, in the case of pystac by accessing asset.extra_fields['datetime'].

An Asset also has roles, in the case of this corpus either "data" or "metadata". "Data" Assets reference the image chips themselves, whereas Planet Fusion also has pixelwise quality assurance masks linked as "metadata" assets.1 Note that the Asset keys were also designed to provide information on the type of file ("sr" for surface reflectance image chip or "qa" for quality assurance masks) and timestep. This allows us to quickly select the Assets we are interested in if we know these criteria.
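Since the Asset keys follow a simple "&lt;type&gt;_&lt;date&gt;" convention, both pieces of information can be recovered from the key alone, without touching the Asset's properties. As a minimal sketch (parse_asset_key is a hypothetical helper, not part of the corpus tooling):

```python
def parse_asset_key(key):
    """Split an Asset key like "sr_2018-01-03" into its file type and timestep."""
    file_type, _, timestep = key.partition("_")
    return file_type, timestep

print(parse_asset_key("sr_2018-01-03"))  # ('sr', '2018-01-03')
print(parse_asset_key("qa_2018-01-08"))  # ('qa', '2018-01-08')
```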

Each Asset also provides the absolute URL to download the file via the href property.

Let's look at some toy examples for filtering Assets.

1 Sentinel-2 metadata in the form of traceability manifests are stored at the tile Collection level. For more details, refer to the corpus documentation.

In [9]:
# Define the time period for filtering
start_date = pd.to_datetime("2018-05-01T00:00:00Z")
end_date = pd.to_datetime("2018-06-01T00:00:00Z")

# Filter the assets based on the defined time period
time_filtered_assets = {
    k: v
    for k, v in example_pf_assets.items()
    if start_date < pd.to_datetime(v.extra_fields["datetime"]) < end_date
}

# Print the number of assets and their names within the time period
print(f"{len(time_filtered_assets)} Assets in time period {start_date} - {end_date}:")
for asset_name, _ in time_filtered_assets.items():
    print(f"    {asset_name}.")

# Define the role for further filtering
role = "data"

# Filter the time-filtered assets based on the defined role
role_filtered_assets = {
    k: v for k, v in time_filtered_assets.items() if v.has_role(role)
}

print(f'Of which, {len(role_filtered_assets)} have the role "data".')
for asset_name, _ in role_filtered_assets.items():
    print(f"    {asset_name}.")
12 Assets in time period 2018-05-01 00:00:00+00:00 - 2018-06-01 00:00:00+00:00:
    sr_2018-05-03.
    qa_2018-05-03.
    sr_2018-05-08.
    qa_2018-05-08.
    sr_2018-05-13.
    qa_2018-05-13.
    sr_2018-05-18.
    qa_2018-05-18.
    sr_2018-05-23.
    qa_2018-05-23.
    sr_2018-05-28.
    qa_2018-05-28.
Of which, 6 have the role "data".
    sr_2018-05-03.
    sr_2018-05-08.
    sr_2018-05-13.
    sr_2018-05-18.
    sr_2018-05-23.
    sr_2018-05-28.

And we can also list the URLs of the files for download, which we could then download using azcopy (see Downloading subsets with azcopy), the Azure Storage SDK, or via HTTPS calls.

In [10]:
print("URLs of filtered Assets:")
for asset_name, asset in role_filtered_assets.items():
    print(asset.href)
URLs of filtered Assets:
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-03.tif
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-08.tif
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-13.tif
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-18.tif
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-23.tif
https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/PF-SR/2018-05-28.tif

3.C. Querying related Items¶

So far our examples have just iterated over a single example Item. We could iterate over the rest of our Items, running the same or other queries, to generate a list of all relevant Assets in our original spatial query. If we want to reuse the same temporal and role filtering, we can also rely on the knowledge that each location has the same timesteps and retrieve the corresponding Assets by reusing the keys our first temporal and role query had returned.

In [11]:
query_pf_hrefs = []
pf_asset_query_keys = [*role_filtered_assets.keys()]
for item in berlin_pf_items:
    for k in pf_asset_query_keys:
        query_pf_hrefs.append(item.assets[k].href)
print(f"Found {len(query_pf_hrefs):,} matching URLs.")
Found 1,188 matching URLs.

We have been filtering within the Planet Fusion Collection only, but if we can also tie together the corresponding Sentinel-2 images and CLC labels by using the links within the Planet Fusion Items, specifically the links with relation type "related". Let's take a first look at our single example_item again to see how these relations work.

In [12]:
related_items = [
    link.resolve_stac_object().target
    for link in example_pf_item.links
    if link.rel == "related"
]

for item in related_items:
    print(f"Found related item in Collection {item.collection_id}.")
Found related item in Collection rapidai4eo_v1_labels.
Found related item in Collection rapidai4eo_v1_source_s2.

When looking at the related Items there a few ways of getting information about what the Item represents (such as item.properties['title']), but the one that will be most convenient to filter by, in the case we only want one or the other product type, is item.collection_id. As we know that there is a unique Collection per product type, we could filter the related Items by those that belong to a specific collection. We've looked at Planet Fusion Assets, now let's also inspect Sentinel-2 and CLC labels examples.

In [13]:
for item in related_items:
    print(f"Found related item in Collection {item.collection_id}.")

    for i, (asset_name, asset) in enumerate(item.assets.items()):
        print(f"Asset: {asset_name}:")
        print(asset.to_dict())
        print("---")

        if i >= n_head - 1:
            break

    print("---")
Found related item in Collection rapidai4eo_v1_labels.
Asset: labels:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/labels/33N/16E-243N/labels_33N_16E-243N_29_01.geojson', 'type': 'application/geo+json', 'roles': ['labels', 'labels-vector']}
---
---
Found related item in Collection rapidai4eo_v1_source_s2.
Asset: s2_2018-01:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/S2-SR/2018-01.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-01-15T10:30:00Z', 'roles': ['data']}
---
Asset: s2_2018-02:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/S2-SR/2018-02.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-02-15T10:30:00Z', 'roles': ['data']}
---
Asset: s2_2018-03:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/S2-SR/2018-03.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-03-15T10:30:00Z', 'roles': ['data']}
---
Asset: s2_2018-04:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/S2-SR/2018-04.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-04-15T10:30:00Z', 'roles': ['data']}
---
Asset: s2_2018-05:
{'href': 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/33N/16E-243N/33N_16E-243N_29_01/S2-SR/2018-05.tif', 'type': 'image/tiff; application=geotiff', 'datetime': '2018-05-15T10:30:00Z', 'roles': ['data']}
---
---

We see here that the corresponding labels Item has a single Asset pointing to a GeoJSON file, which will contain the CLC multiclass labels. The Sentinel-2 Item has one Asset for each of the monthly image mosiacs. Tying together some of the steps above, let's look at filtering all Planet Fusion and Sentinel-2 Assets.

In [14]:
query_pfs2_hrefs = []
pf_asset_query_keys = [*role_filtered_assets.keys()]
s2_asset_query_keys = None

for pf_item in berlin_pf_items:

    # Filter Planet Fusion Assets
    for k in pf_asset_query_keys:
        query_pfs2_hrefs.append(pf_item.assets[k].href)

    # Filter Sentinel-2 Assets related to that Planet Fusion Item (i.e. is sampled at the same location)
    related_items = [
        link.resolve_stac_object().target
        for link in example_pf_item.links
        if link.rel == "related"
    ]

    for related_item in related_items:

        # Filter down to s2 items only (exclude labels for this example)
        if related_item.collection_id == "rapidai4eo_v1_source_s2":

            if s2_asset_query_keys is None:
                # This is our first look at s2 Assets, so the keys we want to access have not
                # yet been defined
                s2_asset_query_keys = [
                    k
                    for k, v in related_item.assets.items()
                    if start_date
                    < pd.to_datetime(v.extra_fields["datetime"])
                    < end_date
                ]
                # NOTE "role" filtering to only "data" is not necessary in the case of Sentinel-2
                # as no metadata files are linked to the STAC Items.

            for k in s2_asset_query_keys:
                query_pfs2_hrefs.append(related_item.assets[k].href)

print(f"Found {len(query_pfs2_hrefs):,} matching URLs.")
Found 1,386 matching URLs.

We have seen some approaches to querying the STAC and applied some of our knowledge about redundant metadata in the corpus to avoid redundantly accessing remote STAC files. The patterns applied here can be adapted to filter the data by any spatio-temporal criteria. Crawling a static STAC can be slow, and our workarounds to avoid redundant filtering are not always straightforward. In the next section we will look at using some resource files to run similar queries in a faster and more straightforward manner.

4. Optimized queries¶

As mentioned above we can leverage our knowledge about the redundant nature of metadata in the corpus to improve query times, albeit by working somewhat outside the STAC standard. To do this we will use three files that give us an overview of the metadata in the corpus:

  1. rapidai4eo_geometries.geojson.gz, containing the geometries of all sample locations;
  2. rapidai4eo_labels.csv.gz, containing the CLC multiclass labels at all sample locations; and
  3. rapidai4eo_label_mappings.csv, containing a mapping between the three levels of the CLC class hierarchy.

All three of these files should be downloaded for the below examples to run (see the second following code block to download these via cURL). If not downloading to the same directory as this notebook, or if the files are renamed, update the variables in the cell below.

In [15]:
import gzip

import geopandas as gpd
import pandas as pd
import shapely.geometry

geometries_file = "rapidai4eo_geometries.geojson.gz"
labels_file = "rapidai4eo_labels.csv.gz"
labels_mapping_file = "rapidai4eo_label_mappings.csv"

geometries_file_url = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/resources/rapidai4eo_geometries.geojson.gz"
labels_file_url = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/resources/rapidai4eo_labels.csv.gz"
labels_mapping_file_url = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/resources/rapidai4eo_label_mappings.csv"

Optionally, download the resource files via cURL.

In [16]:
!curl {geometries_file_url} -o {geometries_file}
!curl {labels_file_url} -o {labels_file}
!curl {labels_mapping_file_url} -o {labels_mapping_file}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15.0M  100 15.0M    0     0  20.3M      0 --:--:-- --:--:-- --:--:-- 20.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8520k  100 8520k    0     0  12.2M      0 --:--:-- --:--:-- --:--:-- 12.2M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1759  100  1759    0     0  13126      0 --:--:-- --:--:-- --:--:-- 13029
In [17]:
%%time
with gzip.open(geometries_file) as fs:
    geometries = gpd.read_file(fs).set_index("sample_id")
labels = pd.read_csv(labels_file).set_index("sample_id")
labels_mapping = pd.read_csv(labels_mapping_file).set_index("index")

print("Geometries:")
print(geometries.head())
print("---")
print("Labels (columns truncated):")
print(labels[labels.columns[:6]].head())
print("---")
Geometries:
                                                             geometry
sample_id                                                            
35N_19E-292N_23_31  POLYGON ((26.49348 63.29178, 26.49338 63.29717...
35N_19E-292N_01_05  POLYGON ((26.17905 63.40880, 26.17889 63.41418...
35N_19E-292N_17_03  POLYGON ((26.15755 63.32251, 26.15739 63.32789...
35N_19E-292N_05_11  POLYGON ((26.25168 63.38765, 26.25154 63.39304...
35N_19E-292N_36_28  POLYGON ((26.45890 63.22165, 26.45880 63.22704...
---
Labels (columns truncated):
                    clc_111   clc_112  clc_121  clc_122  clc_123  clc_124
sample_id                                                                
31N_20E-211N_16_38      0.0  0.000000      0.0      0.0      0.0      0.0
35N_24E-173N_23_37      0.0  0.269575      0.0      0.0      0.0      0.0
32N_20E-205N_28_20      0.0  0.254225      0.0      0.0      0.0      0.0
33N_24E-237N_23_27      0.0  0.000000      0.0      0.0      0.0      0.0
33N_27E-224N_33_23      0.0  0.000000      0.0      0.0      0.0      0.0
---
CPU times: user 29.5 s, sys: 421 ms, total: 30 s
Wall time: 30 s

Each sample location within the corpus has a unique ID, which is integral to building the IDs of Items within the STAC and the URLs of the Assets. For example, the sample location with ID 35N_19E-292N_23_31 will correspond to a Planet Fusion Item (rapidai4eo_v1_source_pf_35N_19E-292N_23_31), a Sentinel-2 Item (rapidai4eo_v1_source_s2_35N_19E-292N_23_31), and a label Item (rapidai4eo_v1_labels_35N_19E-292N_23_31). Similarly, given a sample ID, a product type, and—in the case of our image products—a valid date we can build the URLs to any Asset in the corpus, as described in the Appendix. We will rely on these files to filter by geometries or CLC class and apply our knowledge of the timesteps available in the corpus to quickly query Assets.

4.A. Spatio-temporal queries¶

Let us begin by running the same query over Berlin we had run above.

In [18]:
%%time
berlin_bbox = shapely.geometry.box(13.05, 52.35, 13.72, 52.69)
spatially_filtered_ids = geometries[geometries.geometry.intersects(berlin_bbox)].index
print(f"{spatially_filtered_ids.size} IDs found in the vicinity of Berlin.")
print("---")
198 IDs found in the vicinity of Berlin.
---
CPU times: user 123 ms, sys: 12 ms, total: 135 ms
Wall time: 134 ms

As we can see, the spatial query returned in a fraction of the time it takes to crawl a STAC Collection. When querying against larger geometries, say, an entire country, this will make a difference from hours to minutes.

What we've identified so far are the IDs of locations where we have Assets we will want to download. The next step is to build the URLs to those assets so that we can download them. To achieve this, we'll lean on the get_asset_hrefs function in the rapidai4eo.py file provided with this tutorial (download that file to the same directory as the notebook if you have not already). This class takes the busywork out of transforming a list of IDs into a list of hrefs. If we just pass it a list of IDs, it will return URLs for all products (Planet Fusion imagery, Sentinel-2 imagery, and Planet Fusion quality assurance masks) and all available timesteps at those locations. We can optionally pass a list of products or a start and end date as a tuple to filter by those criteria.

In [19]:
from rapidai4eo import get_asset_hrefs, PRODUCTS

print(PRODUCTS)
['pfsr', 'pfqa', 's2']

We see here that we have the following product keys: 'pfsr' for Planet Fusion imagery, 'pfqa' for Planet Fusion quality assurance masks, and 's2' for Sentinel-2 imagery.

In [20]:
hrefs = get_asset_hrefs(spatially_filtered_ids)
print(
    f"{len(hrefs):,} image chips and quality assurance masks in our area of interest."
)

hrefs = get_asset_hrefs(spatially_filtered_ids, products=["pfsr", "s2"])
print(
    f"{len(hrefs):,} image chips in our area of interest, ignoring quality assurance masks"
)

hrefs = get_asset_hrefs(spatially_filtered_ids, products="s2")
print(f"{len(hrefs):,} Sentinel-2 image chips only in our area of interest.")

start_date = pd.to_datetime("2018-01-01T00:00:00Z")
end_date = pd.to_datetime("2019-01-01T00:00:00Z")
hrefs = get_asset_hrefs(
    spatially_filtered_ids,
    products=["pfsr", "s2"],
    temporal_filter=(start_date, end_date),
)
print(
    f"{len(hrefs):,} Planet Fusion and Sentinel-2 image chips in our area of interest and "
    "filtering by the time period for which both products are available (2018)."
)
60,192 image chips and quality assurance masks in our area of interest.
31,284 image chips in our area of interest, ignoring quality assurance masks
2,376 Sentinel-2 image chips only in our area of interest.
16,830 Planet Fusion and Sentinel-2 image chips in our area of interest and filtering by the time period for which both products are available (2018).

4.B. Labels queries¶

In the above examples we applied a spatial query to identify the relevant sample IDs, but if we are interested in certain CLC classes we could also identify sample IDs from the labels file. First let's take a look at our labels mapping DataFrame, which allows us to map from the names of the classes ("class_name") to the corresponding column in our labels DataFrame (via "index").

In [21]:
labels_mapping
Out[21]:
clc_level1_id clc_level2_id clc_level3_id class_name
index
clc_111 1 11 111 Continuous urban fabric
clc_112 1 11 112 Discontinuous urban fabric
clc_121 1 12 121 Industrial or commercial units
clc_122 1 12 122 Road and rail networks and associated land
clc_123 1 12 123 Port areas
clc_124 1 12 124 Airports
clc_131 1 13 131 Mineral extraction sites
clc_132 1 13 132 Dump sites
clc_133 1 13 133 Construction sites
clc_141 1 14 141 Green urban areas
clc_142 1 14 142 Sport and leisure facilities
clc_211 2 21 211 Non-irrigated arable land
clc_212 2 21 212 Permanently irrigated land
clc_213 2 21 213 Rice fields
clc_221 2 22 221 Vineyards
clc_222 2 22 222 Fruit trees and berry plantations
clc_223 2 22 223 Olive groves
clc_231 2 23 231 Pastures
clc_241 2 24 241 Annual crops associated with permanent crops
clc_242 2 24 242 Complex cultivation patterns
clc_243 2 24 243 Land principally occupied by agriculture with ...
clc_244 2 24 244 Agro-forestry areas
clc_311 3 31 311 Broad-leaved forest
clc_312 3 31 312 Coniferous forest
clc_313 3 31 313 Mixed forest
clc_321 3 32 321 Natural grasslands
clc_322 3 32 322 Moors and heathland
clc_323 3 32 323 Sclerophyllous vegetation
clc_324 3 32 324 Transitional woodland-shrub
clc_331 3 33 331 Beaches dunes sands
clc_332 3 33 332 Bare rocks
clc_333 3 33 333 Sparsely vegetated areas
clc_334 3 33 334 Burnt areas
clc_335 3 33 335 Glaciers and perpetual snow
clc_411 4 41 411 Inland marshes
clc_412 4 41 412 Peat bogs
clc_421 4 42 421 Salt marshes
clc_422 4 42 422 Salines
clc_423 4 42 423 Intertidal flats
clc_511 5 51 511 Water courses
clc_512 5 51 512 Water bodies
clc_521 5 52 521 Coastal lagoons
clc_522 5 52 522 Estuaries
clc_523 5 52 523 Sea and ocean
clc_999 9 99 999 NODATA

Let's say we are interested only in peat bogs for some research topic. Looking at the mappings, we can see that the portion of a sample locations covered by peat bog according to the 2018 CLC will be represented in our labels DataFrame by the column "clc_412". If we want to find all sample locations that are at least 80% covered by peat bog, we filter that column by values greater than 0.8.

In [22]:
peat_bog_samples = labels[labels["clc_412"] > 0.8].index
print(f"{peat_bog_samples.size:,}")
2,753

Now we can select the image chip URLs that we want to download in the same pattern we had used above, for example:

In [23]:
hrefs = get_asset_hrefs(peat_bog_samples, products="pfsr")
print(f"{len(hrefs):,} Planet Fusion image chips over peat bogs.")
401,938 Planet Fusion image chips over peat bogs.

Let's tie this together with a final example. For this case we will search for all image chips within Germany that are entirely covered by forest, also applying a temporal filter to restrict our results to the time period for which we have both image types (2018). We will read the geometry for Germany from Eruostat's NUTS dataset. In the case of forest classes, there are three level-3 entries in the CLC ontology: coniferous, deciduous, and mixed. For the labels query, we will combine these three classes by summation.

In [24]:
eu_countries_file = (
    "https://gisco-services.ec.europa.eu/distribution/v2/nuts/topojson/"
    "NUTS_RG_20M_2021_4326_LEVL_0.json"
)
eu_countries = gpd.read_file(eu_countries_file).set_index("id")
germany_geometry = eu_countries.loc["DE", "geometry"]
In [25]:
%%time
# Filter by our country geometry
germany_samples = geometries[geometries.geometry.within(germany_geometry)].index

# At level-3, the CLC has three forest classes which all belong
# to CLC level-2 class 31.
forest_classes = labels_mapping[labels_mapping.clc_level2_id == 31].index.values

# Filter all available labels by our above spatial filter, keeping
# only relevant (forest class) columns
germany_labels = labels.loc[germany_samples][forest_classes]
# And now we can get the proportion of each location covered by any forest
# class simply by summing across columns.
germany_forest_label = germany_labels.sum(axis=1)

# Let's say we now want only those locations entirely covered by forest
germany_forest_samples = germany_forest_label[germany_forest_label == 1.0].index
print(
    f"Found {germany_forest_samples.size:,} samples in Germany entirely covered by forest (CLC class 31)."
)

# And finally we can select the URLs to download any product types we want
# Let's say again we want both image types, without the QA files, for the
# calendar year 2019.
start_date = pd.to_datetime("2018-01-01T00:00:00Z")
end_date = pd.to_datetime("2019-01-01T00:00:00Z")
hrefs = get_asset_hrefs(
    germany_forest_samples,
    products=["pfsr", "s2"],
    temporal_filter=(start_date, end_date),
)
print(
    f"{len(hrefs):,} Planet Fusion and Sentinel-2 image chips in our area of interest and "
    "filtering by the time period for which both products are available (2018)."
)
Found 3,214 samples in Germany entirely covered by forest (CLC class 31).
273,190 Planet Fusion and Sentinel-2 image chips in our area of interest and filtering by the time period for which both products are available (2018).
CPU times: user 4.55 s, sys: 15 ms, total: 4.56 s
Wall time: 4.56 s

For this example query we now have a list of 273,190 image Assets, representing the time series for both Sentinel-2 and Planet Fusion at 3,214 locations across Germany covered entirely by forest. The same query run against the static STAC would have been prohibitively slow, but here it runs in about five seconds.

As mentioned above, now that we have a list of URLs we can download the matching files at will. Two approaches not covered in this tutorial are scripting the downloads with the Azure Storage SDK or with python-requests. In the next section we look at using AzCopy to download files from a list.
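As a rough sketch of the scripted approach, the helper below downloads one asset while mirroring the blob's directory structure locally (to avoid file name collisions). It uses only the standard library; the names `local_path` and `download` are illustrative, not part of the corpus utilities.

```python
# Illustrative sketch only: download corpus assets from a list of URLs,
# mirroring the blob's subdirectory layout under a local destination.
from pathlib import Path
from urllib.request import urlretrieve

PREFIX = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/"

def local_path(href: str, dest: str = "downloads") -> Path:
    """Map an asset URL to a local path, preserving subdirectories."""
    return Path(dest) / href.removeprefix(PREFIX)

def download(href: str, dest: str = "downloads") -> Path:
    """Fetch a single asset; callers can loop or parallelize over many hrefs."""
    target = local_path(href, dest)
    target.parent.mkdir(parents=True, exist_ok=True)
    urlretrieve(href, target)  # network call; swap in requests or the SDK as preferred
    return target
```

The same path-mapping logic applies regardless of which HTTP client does the actual transfer.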

5. Downloading subsets with azcopy¶

If we save a list of file URLs we want to download, as identified in the patterns from the above examples, we can pass those to an azcopy command.

azcopy copy 'https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/' my/destination/directory/ --list-of-files="files_of_interest.txt" --include-directory-stub

where

  • --list-of-files is where we pass the text file in which we've saved the file URLs we want to download; and
  • the --include-directory-stub flag creates subdirectories in our download destination to match those in our source, which prevents file name collisions.
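A small, hedged sketch of preparing files_of_interest.txt from a list of hrefs follows; the function names are illustrative. Note that, per the AzCopy wiki, the list entries should be paths relative to the source URL rather than full URLs, so the sketch strips the corpus prefix first.

```python
# Hedged sketch: write asset hrefs to a text file for azcopy --list-of-files.
# Entries are made relative to the azcopy source URL by stripping the prefix.
PREFIX = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/"

def to_relative(hrefs):
    """Strip the corpus prefix so entries are relative to the azcopy source."""
    return [href.removeprefix(PREFIX) for href in hrefs]

def write_file_list(hrefs, path="files_of_interest.txt"):
    """Write one relative path per line, as azcopy expects."""
    with open(path, "w") as f:
        for entry in to_relative(hrefs):
            f.write(entry + "\n")
```

With the file written, the azcopy command shown above can be run as-is.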

Be aware that the AzCopy wiki makes the following statement about the --list-of-files flag:

As of version 10.3, this flag remains undocumented (except for this wiki page) and it does not appear in the in-app command line help. It is however regularly used by Azure Storage Explorer to pass file lists to its embedded copy of AzCopy.

(The reason that it's undocumented is that, at high file counts, it has slow performance for downloads and service-to-service copies. We want to fix that before we document it more officially).

(retrieved 2023-05-10)

6. Conclusion¶

We have seen some patterns to browse, filter, and download RapidAI4EO corpus data. Browsing via the STAC is supported, but can be slow for larger queries because the corpus has a lot of redundant metadata. Basic spatio-temporal and labels-based queries can be optimized by using two files containing consolidated metadata. Finally, we saw how to leverage AzCopy to download a list of file URLs, enabling us to retrieve only the portions of the corpus relevant to the application at hand. The various patterns shown here can be built upon to filter by complex queries. What we have so far abstracted with the utilities in rapidai4eo.py is how to build the URLs to assets. These URLs follow a pattern that can easily be applied to build bespoke queries not supported by the utilities; more details are given in the Appendix below.


7. Appendix: Building asset URLs¶

We saw in the section on Optimized queries that the URLs to data and metadata files can be built from only a few pieces of information; however, the URL-building process itself was abstracted by our get_asset_hrefs function. In this appendix we cover the patterns present in the URLs, focusing on three file types: Planet Fusion imagery, Sentinel-2 imagery, and Planet Fusion quality assurance files. There are also per-location labels files compliant with the STAC label extension, but we will not cover them here, preferring to access label information via the consolidated labels file rapidai4eo_labels.csv.gz.

| Example | Prefix | UTM zone | Tile offset | Sample ID | Product code | Date-derived basename |
| --- | --- | --- | --- | --- | --- | --- |
| Sentinel-2 image | https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/ | 30N/ | 12E-191N/ | 30N_12E-191N_01_19/ | S2-SR/ | 2018-01.tif |
| Planet Fusion image | https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/ | 30N/ | 12E-191N/ | 30N_12E-191N_01_19/ | PF-SR/ | 2018-01-03.tif |
| Planet Fusion QA | https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery/ | 30N/ | 12E-191N/ | 30N_12E-191N_01_19/ | PF-QA/ | 2018-01-03.tif |

The paths to all of these file types begin with https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery.[2] The next three components relate to the sample location: the UTM zone in which the sample is found, the grid tile[3] in which the sample is found, and the sample ID. Note that the sample ID itself contains the UTM zone and tile offset, and is therefore sufficient for building URLs to this level. After the sample ID comes a code specifying the product type and, finally, the file basename, which is derived from the timestep the asset represents. Planet Fusion, available in the corpus at a five-day cadence, has basenames formatted as ISO date strings. Sentinel-2, available in the corpus at a monthly cadence, has the date representations truncated to the month (i.e. with formatting '%Y-%m'). Therefore the URL to any asset in the corpus can be built given a sample ID, a product type, and a timestep.
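The pattern above can be sketched as a small helper. This is illustrative only (the corpus utilities already provide get_asset_hrefs for this purpose), and the function name is our own; the sample ID, product codes, and date formats follow the table above.

```python
# Illustrative sketch of the asset URL pattern described above.
from datetime import date

BASE = "https://radiantearth.blob.core.windows.net/mlhub/rapidai4eo/imagery"

def build_asset_url(sample_id: str, product: str, timestep: date) -> str:
    """Build an asset URL from a sample ID, product code, and timestep.

    The sample ID (e.g. '30N_12E-191N_01_19') embeds the UTM zone and the
    tile offset, so the path components can be derived from it alone.
    """
    utm_zone, tile_offset, _, _ = sample_id.split("_")
    if product == "S2-SR":
        basename = timestep.strftime("%Y-%m") + ".tif"  # monthly cadence
    else:  # PF-SR or PF-QA
        basename = timestep.isoformat() + ".tif"  # five-day cadence
    return f"{BASE}/{utm_zone}/{tile_offset}/{sample_id}/{product}/{basename}"
```

For example, build_asset_url("30N_12E-191N_01_19", "PF-SR", date(2018, 1, 3)) reproduces the Planet Fusion row of the table.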

[2] Planet Fusion quality assurance files are also stored under the imagery prefix because they are formatted like images, in that they contain pixel-level information stored in the GeoTIFF format.

[3] The tiling grid used in the corpus is the Planet Fusion processing grid, which is composed of 24 × 24 km tiles identified by their offset (easting and northing) within the UTM zone.