使用gdal批量预处理Sentinel2 L2A数据集并输出为TIFF文件

我的需求

简单说一下我的需求：我是计算机专硕，最近需要下载并处理哨兵2号（Sentinel-2）数据，但网上的处理方法大多需要安装专门的软件才能进行批量处理，十分麻烦。经过研究，我用 Python 和 GDAL 写了一段代码，可以批量提取真彩色波段并输出为 TIFF 文件。

如果帮到大家，希望点个赞谢谢啦 Ciallo～(∠・ω< )。

那么如何批量处理Sentinel-2 L2A数据集？

下载好的 Sentinel-2 文件如下：

下面简单地介绍一下代码的功能，这段代码主要是处理 Sentinel-2 卫星影像数据的压缩文件（.zip 或 .tar.gz 格式），将其解压后，提取和处理波段数据：

1. 解压压缩包 (`uncompress`)

该函数支持 .zip 和 .tar.gz（含 .tgz）格式的压缩包，并将其内容解压到指定的输出目录。
input_path 是输入的压缩文件路径，output_dir 是解压后的目标文件夹。

def uncompress(input_path, output_dir):
    """Extract a Sentinel-2 archive into ``output_dir``.

    The archive type is chosen from the file extension, compared
    case-insensitively: ``.zip`` is handled with :mod:`zipfile`, while
    ``.tar.gz`` / ``.tgz`` are handled with :mod:`tarfile`.

    :param input_path: path of the archive to extract
    :param output_dir: directory that receives the extracted files
    :raises ValueError: if the extension is not one of the supported ones
    """
    # Lower-case only the copy used for matching; the real path is untouched,
    # so archives named e.g. ``*.ZIP`` are accepted as well.
    lowered = os.fspath(input_path).lower()

    if lowered.endswith('.zip'):
        with zipfile.ZipFile(input_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    elif lowered.endswith(('.tar.gz', '.tgz')):
        with tarfile.open(input_path, 'r:gz') as tar_ref:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters exist on this interpreter: refuse members
                # that would land outside output_dir (absolute paths, "..").
                tar_ref.extractall(output_dir, filter='data')
            else:
                # Older interpreter without extraction filters.
                tar_ref.extractall(output_dir)
    else:
        raise ValueError("不支持的压缩格式，仅支持.zip和.tar.gz")

    # The empty line keeps the message off the line of an active progress bar.
    print()
    print(f"解压完成，文件保存在：{output_dir}")

2. 查找 `.jp2` 文件 (`find_jp2_files`)

这个函数用于遍历解压后的文件夹，查找所有 .jp2 格式的影像文件。
通过正则表达式从文件名中提取波段号（如 B02, B03 等）和分辨率（如 10m, 20m 等）信息，并存储为字典。

def find_jp2_files(input_dir):
    """Collect the band images found anywhere below ``input_dir``.

    A file is picked up when its name ends in ``_B<band>_<res>.jp2``, where
    ``<band>`` is two digits or the literal ``8A`` (Sentinel-2 has a band
    named B8A) and ``<res>`` is a number followed by ``m``, e.g.
    ``..._B02_10m.jp2`` or ``..._B8A_20m.jp2``.

    :param input_dir: directory to search recursively
    :return: dict mapping the band id (``'02'``, ``'8A'``, ...) to a list of
        ``{'path': <full path>, 'res': <e.g. '10m'>}`` entries, one per
        matching file; empty when nothing matches
    """
    # Compiled once, outside the loops. ``\Z`` anchors at the very end of the
    # name, so the pattern itself enforces the ``.jp2`` extension.
    band_pattern = re.compile(r'_B(\d{2}|8A)_(\d+m)\.jp2\Z')

    band_files = {}
    for root, _, files in os.walk(input_dir):
        for file in files:
            match = band_pattern.search(file)
            if match is None:
                continue
            band, res = match.groups()
            band_files.setdefault(band, []).append(
                {'path': os.path.join(root, file), 'res': res}
            )
    return band_files

3. 重采样波段 (`resample_band`)

当某个波段的原始分辨率与目标分辨率（如 10m）不一致时，该函数用于将其重采样到目标分辨率。
使用 GDAL 的 gdal.Warp 函数进行重采样，并返回重采样后的 TIFF 文件路径。

def resample_band(input_path, target_res, output_path=None):
    """Resample one band image to ``target_res`` with bilinear interpolation.

    :param input_path: path of the source raster
    :param target_res: target pixel size as text whose last character is the
        unit, e.g. ``'10m'``; the part before it must be a positive integer
    :param output_path: where to write the result. When omitted, the file is
        created inside a fresh private temporary directory and the caller is
        responsible for removing it.
    :return: path of the resampled raster
    :raises ValueError: if ``target_res`` does not describe a positive integer
    :raises RuntimeError: if GDAL reports failure by returning no dataset
    """
    # Parse first so that bad input fails before anything touches the disk.
    try:
        res_value = int(target_res[:-1])  # '10m' -> 10
    except ValueError:
        raise ValueError(f"无效的目标分辨率：{target_res!r}") from None
    if res_value <= 0:
        raise ValueError(f"无效的目标分辨率：{target_res!r}")

    if not output_path:
        # tempfile.mktemp() is deprecated because the name it returns can be
        # claimed by someone else before use; a private directory avoids that
        # while still handing GDAL a path that does not exist yet.
        output_path = os.path.join(
            tempfile.mkdtemp(prefix='s2_resample_'), 'resampled.tif'
        )

    warped = gdal.Warp(
        output_path,
        input_path,
        xRes=res_value,
        yRes=res_value,
        resampleAlg=gdal.GRA_Bilinear,
    )
    if warped is None:
        raise RuntimeError(f"重采样失败：{input_path}")
    # Drop the dataset reference so GDAL flushes and closes the output file.
    warped = None
    return output_path

4. 处理和合并波段 (`process_to_tif`)

这个函数用于处理解压后的 Sentinel-2 数据，将其转换为一个多波段的GeoTIFF文件。
它会根据指定的波段（例如，红色波段 B04、绿色波段 B03、蓝色波段 B02），找到对应的 .jp2 文件并确保它们有相同的分辨率。
如果某个波段的分辨率不匹配，代码会自动进行重采样。
使用 GDAL 的 BuildVRT 和 Translate 函数将波段合并为一个多波段 TIFF 文件，并应用压缩和瓦片化。

def process_to_tif(input_dir, output_path, target_res='10m', bands=None):
    """Stack Sentinel-2 band images into one multi-band GeoTIFF.

    For every requested band the image whose resolution equals ``target_res``
    is used directly. If a band has no such image, its finest available image
    is resampled to ``target_res`` first. The bands are then stacked in the
    requested order through a VRT and written as a tiled, DEFLATE-compressed
    GeoTIFF.

    :param input_dir: extracted SAFE directory to search for band images
    :param output_path: path of the GeoTIFF to write
    :param target_res: target resolution, e.g. ``'10m'`` (the default)
    :param bands: band ids to stack, in output order (values are converted to
        text and zero-padded to two characters); ``None`` means every band
        that was found, in sorted order
    :raises ValueError: if none of the requested bands could be found
    :raises RuntimeError: if GDAL fails to build the VRT or write the GeoTIFF
    """
    band_files = find_jp2_files(input_dir)

    if bands is None:
        selected_bands = sorted(band_files)
    else:
        selected_bands = [str(b).zfill(2) for b in bands]

    # Every intermediate file lives in one private directory. This replaces
    # the deprecated tempfile.mktemp() and lets the ``finally`` clause clean
    # up even when a step raises.
    temp_dir = tempfile.mkdtemp(prefix='s2_to_tif_')
    temp_files = []
    try:
        processed_bands = []
        for index, band in enumerate(selected_bands):
            candidates = band_files.get(band)
            if not candidates:
                print(f"警告：未找到波段B{band}，已跳过")
                continue

            # Prefer an image that is already at the target resolution.
            native = next(
                (f for f in candidates if f['res'] == target_res), None
            )
            if native is not None:
                processed_bands.append(native['path'])
                continue

            # Otherwise resample from the finest image (smallest pixel size).
            finest = min(candidates, key=lambda f: int(f['res'][:-1]))
            # The index keeps names unique if a band is requested twice.
            temp_path = os.path.join(
                temp_dir, f'{index}_B{band}_{target_res}.tif'
            )
            temp_files.append(temp_path)
            resample_band(finest['path'], target_res, temp_path)
            processed_bands.append(temp_path)

        if not processed_bands:
            raise ValueError("没有找到有效的波段数据进行处理")

        # separate=True puts each input file into its own output band.
        vrt_path = os.path.join(temp_dir, 'stack.vrt')
        temp_files.append(vrt_path)
        vrt = gdal.BuildVRT(
            vrt_path,
            processed_bands,
            options=gdal.BuildVRTOptions(separate=True),
        )
        if vrt is None:
            raise RuntimeError("构建VRT失败")
        vrt = None  # release the dataset so the VRT is flushed to disk

        translate_options = gdal.TranslateOptions(
            format='GTiff',
            creationOptions=['COMPRESS=DEFLATE', 'TILED=YES'],
        )
        result = gdal.Translate(
            output_path, vrt_path, options=translate_options
        )
        if result is None:
            raise RuntimeError(f"生成GeoTIFF失败：{output_path}")
        result = None  # release the dataset so the GeoTIFF is closed
    finally:
        # Best-effort cleanup: a leftover temp file must never mask the
        # outcome of the processing itself.
        for path in temp_files:
            try:
                os.remove(path)
            except OSError:
                pass
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass

5. 批量处理多个 `.SAFE.zip` 文件 (`process_all_safe_zips`)

该函数用于遍历指定目录下的所有 .SAFE.zip 文件，并依次进行解压、处理，最终将每个 .SAFE.zip 文件转换为一个多波段的GeoTIFF。
每个 .SAFE.zip 文件代表一个 Sentinel-2 影像数据集，经过解压后会生成一个包含波段数据的 SAFE 文件夹。
处理后的 TIFF 文件会保存在输出目录的 img 文件夹中，文件名会与输入的 .SAFE.zip 文件名相对应。

def process_all_safe_zips(result_dir):
    """Convert every ``*.SAFE.zip`` archive below ``result_dir`` to a GeoTIFF.

    Each archive is extracted into ``<result_dir>/output``. Its B04, B03 and
    B02 bands are then stacked at 10m into
    ``<result_dir>/output/img/<archive name without .SAFE.zip>.tif``.

    :param result_dir: directory searched recursively for ``*.SAFE.zip`` files
    """
    # These paths do not depend on the archive, so compute them once.
    output_dir = os.path.join(result_dir, 'output')
    img_dir = os.path.join(output_dir, 'img')

    for root, dirs, files in os.walk(result_dir):
        # Do not descend into our own output directory: on a re-run it holds
        # previously extracted products, which contain no archives.
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]

        # Show a progress bar only for directories that contain archives,
        # and count only the archives.
        archives = [name for name in files if name.endswith('.SAFE.zip')]
        if not archives:
            continue

        for file in tqdm(archives):
            input_zip = os.path.join(root, file)

            # 'X.SAFE.zip' -> 'X'
            file_name = os.path.splitext(file)[0].replace('.SAFE', '')
            final_tif = os.path.join(img_dir, f'{file_name}.tif')
            print(f"处理文件: {input_zip} -> {final_tif}")

            # Creates output_dir as well, since img_dir is inside it.
            os.makedirs(img_dir, exist_ok=True)

            # Step 1: extract the archive.
            uncompress(input_zip, output_dir)

            # Step 2: build the GeoTIFF. This assumes 'X.SAFE.zip' extracts
            # to a top-level 'X.SAFE' directory.
            safe_dir = os.path.join(output_dir, os.path.splitext(file)[0])
            process_to_tif(
                input_dir=safe_dir,
                output_path=final_tif,
                target_res='10m',
                # True-colour order: red (B04), green (B03), blue (B02).
                # Change the list to stack other bands,
                # e.g. ['02', '03', '04', '08'].
                bands=['04', '03', '02'],
            )

            print(f"处理完成，结果已保存至：{final_tif}")

6. 代码运行效果图

省流版：直接看代码集合（点赞关注一下，谢谢）

import os
import re
import zipfile
import tarfile
import tempfile
from osgeo import gdal
from tqdm import tqdm, trange

def uncompress(input_path, output_dir):
    """Extract a Sentinel-2 archive into ``output_dir``.

    The archive type is chosen from the file extension, compared
    case-insensitively: ``.zip`` is handled with :mod:`zipfile`, while
    ``.tar.gz`` / ``.tgz`` are handled with :mod:`tarfile`.

    :param input_path: path of the archive to extract
    :param output_dir: directory that receives the extracted files
    :raises ValueError: if the extension is not one of the supported ones
    """
    # Lower-case only the copy used for matching; the real path is untouched,
    # so archives named e.g. ``*.ZIP`` are accepted as well.
    lowered = os.fspath(input_path).lower()

    if lowered.endswith('.zip'):
        with zipfile.ZipFile(input_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    elif lowered.endswith(('.tar.gz', '.tgz')):
        with tarfile.open(input_path, 'r:gz') as tar_ref:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters exist on this interpreter: refuse members
                # that would land outside output_dir (absolute paths, "..").
                tar_ref.extractall(output_dir, filter='data')
            else:
                # Older interpreter without extraction filters.
                tar_ref.extractall(output_dir)
    else:
        raise ValueError("不支持的压缩格式，仅支持.zip和.tar.gz")

    # The empty line keeps the message off the line of an active progress bar.
    print()
    print(f"解压完成，文件保存在：{output_dir}")


def find_jp2_files(input_dir):
    """Collect the band images found anywhere below ``input_dir``.

    A file is picked up when its name ends in ``_B<band>_<res>.jp2``, where
    ``<band>`` is two digits or the literal ``8A`` (Sentinel-2 has a band
    named B8A) and ``<res>`` is a number followed by ``m``, e.g.
    ``..._B02_10m.jp2`` or ``..._B8A_20m.jp2``.

    :param input_dir: directory to search recursively
    :return: dict mapping the band id (``'02'``, ``'8A'``, ...) to a list of
        ``{'path': <full path>, 'res': <e.g. '10m'>}`` entries, one per
        matching file; empty when nothing matches
    """
    # Compiled once, outside the loops. ``\Z`` anchors at the very end of the
    # name, so the pattern itself enforces the ``.jp2`` extension.
    band_pattern = re.compile(r'_B(\d{2}|8A)_(\d+m)\.jp2\Z')

    band_files = {}
    for root, _, files in os.walk(input_dir):
        for file in files:
            match = band_pattern.search(file)
            if match is None:
                continue
            band, res = match.groups()
            band_files.setdefault(band, []).append(
                {'path': os.path.join(root, file), 'res': res}
            )
    return band_files


def resample_band(input_path, target_res, output_path=None):
    """Resample one band image to ``target_res`` with bilinear interpolation.

    :param input_path: path of the source raster
    :param target_res: target pixel size as text whose last character is the
        unit, e.g. ``'10m'``; the part before it must be a positive integer
    :param output_path: where to write the result. When omitted, the file is
        created inside a fresh private temporary directory and the caller is
        responsible for removing it.
    :return: path of the resampled raster
    :raises ValueError: if ``target_res`` does not describe a positive integer
    :raises RuntimeError: if GDAL reports failure by returning no dataset
    """
    # Parse first so that bad input fails before anything touches the disk.
    try:
        res_value = int(target_res[:-1])  # '10m' -> 10
    except ValueError:
        raise ValueError(f"无效的目标分辨率：{target_res!r}") from None
    if res_value <= 0:
        raise ValueError(f"无效的目标分辨率：{target_res!r}")

    if not output_path:
        # tempfile.mktemp() is deprecated because the name it returns can be
        # claimed by someone else before use; a private directory avoids that
        # while still handing GDAL a path that does not exist yet.
        output_path = os.path.join(
            tempfile.mkdtemp(prefix='s2_resample_'), 'resampled.tif'
        )

    warped = gdal.Warp(
        output_path,
        input_path,
        xRes=res_value,
        yRes=res_value,
        resampleAlg=gdal.GRA_Bilinear,
    )
    if warped is None:
        raise RuntimeError(f"重采样失败：{input_path}")
    # Drop the dataset reference so GDAL flushes and closes the output file.
    warped = None
    return output_path


def process_to_tif(input_dir, output_path, target_res='10m', bands=None):
    """Stack Sentinel-2 band images into one multi-band GeoTIFF.

    For every requested band the image whose resolution equals ``target_res``
    is used directly. If a band has no such image, its finest available image
    is resampled to ``target_res`` first. The bands are then stacked in the
    requested order through a VRT and written as a tiled, DEFLATE-compressed
    GeoTIFF.

    :param input_dir: extracted SAFE directory to search for band images
    :param output_path: path of the GeoTIFF to write
    :param target_res: target resolution, e.g. ``'10m'`` (the default)
    :param bands: band ids to stack, in output order (values are converted to
        text and zero-padded to two characters); ``None`` means every band
        that was found, in sorted order
    :raises ValueError: if none of the requested bands could be found
    :raises RuntimeError: if GDAL fails to build the VRT or write the GeoTIFF
    """
    band_files = find_jp2_files(input_dir)

    if bands is None:
        selected_bands = sorted(band_files)
    else:
        selected_bands = [str(b).zfill(2) for b in bands]

    # Every intermediate file lives in one private directory. This replaces
    # the deprecated tempfile.mktemp() and lets the ``finally`` clause clean
    # up even when a step raises.
    temp_dir = tempfile.mkdtemp(prefix='s2_to_tif_')
    temp_files = []
    try:
        processed_bands = []
        for index, band in enumerate(selected_bands):
            candidates = band_files.get(band)
            if not candidates:
                print(f"警告：未找到波段B{band}，已跳过")
                continue

            # Prefer an image that is already at the target resolution.
            native = next(
                (f for f in candidates if f['res'] == target_res), None
            )
            if native is not None:
                processed_bands.append(native['path'])
                continue

            # Otherwise resample from the finest image (smallest pixel size).
            finest = min(candidates, key=lambda f: int(f['res'][:-1]))
            # The index keeps names unique if a band is requested twice.
            temp_path = os.path.join(
                temp_dir, f'{index}_B{band}_{target_res}.tif'
            )
            temp_files.append(temp_path)
            resample_band(finest['path'], target_res, temp_path)
            processed_bands.append(temp_path)

        if not processed_bands:
            raise ValueError("没有找到有效的波段数据进行处理")

        # separate=True puts each input file into its own output band.
        vrt_path = os.path.join(temp_dir, 'stack.vrt')
        temp_files.append(vrt_path)
        vrt = gdal.BuildVRT(
            vrt_path,
            processed_bands,
            options=gdal.BuildVRTOptions(separate=True),
        )
        if vrt is None:
            raise RuntimeError("构建VRT失败")
        vrt = None  # release the dataset so the VRT is flushed to disk

        translate_options = gdal.TranslateOptions(
            format='GTiff',
            creationOptions=['COMPRESS=DEFLATE', 'TILED=YES'],
        )
        result = gdal.Translate(
            output_path, vrt_path, options=translate_options
        )
        if result is None:
            raise RuntimeError(f"生成GeoTIFF失败：{output_path}")
        result = None  # release the dataset so the GeoTIFF is closed
    finally:
        # Best-effort cleanup: a leftover temp file must never mask the
        # outcome of the processing itself.
        for path in temp_files:
            try:
                os.remove(path)
            except OSError:
                pass
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass


def process_all_safe_zips(result_dir):
    """Convert every ``*.SAFE.zip`` archive below ``result_dir`` to a GeoTIFF.

    Each archive is extracted into ``<result_dir>/output``. Its B04, B03 and
    B02 bands are then stacked at 10m into
    ``<result_dir>/output/img/<archive name without .SAFE.zip>.tif``.

    :param result_dir: directory searched recursively for ``*.SAFE.zip`` files
    """
    # These paths do not depend on the archive, so compute them once.
    output_dir = os.path.join(result_dir, 'output')
    img_dir = os.path.join(output_dir, 'img')

    for root, dirs, files in os.walk(result_dir):
        # Do not descend into our own output directory: on a re-run it holds
        # previously extracted products, which contain no archives.
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]

        # Show a progress bar only for directories that contain archives,
        # and count only the archives.
        archives = [name for name in files if name.endswith('.SAFE.zip')]
        if not archives:
            continue

        for file in tqdm(archives):
            input_zip = os.path.join(root, file)

            # 'X.SAFE.zip' -> 'X'
            file_name = os.path.splitext(file)[0].replace('.SAFE', '')
            final_tif = os.path.join(img_dir, f'{file_name}.tif')
            print(f"处理文件: {input_zip} -> {final_tif}")

            # Creates output_dir as well, since img_dir is inside it.
            os.makedirs(img_dir, exist_ok=True)

            # Step 1: extract the archive.
            uncompress(input_zip, output_dir)

            # Step 2: build the GeoTIFF. This assumes 'X.SAFE.zip' extracts
            # to a top-level 'X.SAFE' directory.
            safe_dir = os.path.join(output_dir, os.path.splitext(file)[0])
            process_to_tif(
                input_dir=safe_dir,
                output_path=final_tif,
                target_res='10m',
                # True-colour order: red (B04), green (B03), blue (B02).
                # Change the list to stack other bands,
                # e.g. ['02', '03', '04', '08'].
                bands=['04', '03', '02'],
            )

            print(f"处理完成，结果已保存至：{final_tif}")


if __name__ == '__main__':
    # Usage: python <this script> [RESULT_DIR]
    import sys

    # Without an argument, fall back to the author's example directory so
    # the script behaves exactly as before.
    default_dir = r"E:\BaiduNetdiskDownload\Tajikistan_186_tianditu\img_8_10\1"
    result_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir
    process_all_safe_zips(result_dir)

使用gdal批量预处理Sentinel2 L2A数据集并输出为TIFF文件

我的需求

那么如何批量处理Sentinel-2 L2A数据集？

1. 解压压缩包 (uncompress)

2. 查找 .jp2 文件 (find_jp2_files)

3. 重采样波段 (resample_band)

4. 处理和合并波段 (process_to_tif)

5. 批量处理多个 .SAFE.zip 文件 (process_all_safe_zips)