[NO ISSUE]: Support Reading Single Depth Files/Folder from S3 Container
Change-Id: Iae9d85eb9899419e63c86322cd8da2273adf89c6
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18206
Reviewed-by: Utsav Singh <utsav.singh@couchbase.com>
Reviewed-by: Hussain Towaileb <hussainht@gmail.com>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
index 55a515c..34d209e 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
@@ -20,6 +20,7 @@
import static org.apache.asterix.test.external_dataset.avro.AvroFileConverterUtil.AVRO_GEN_BASEDIR;
import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER;
+import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BROWSE_CONTAINER;
import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.DYNAMIC_PREFIX_AT_START_CONTAINER;
import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER;
import static org.apache.asterix.test.external_dataset.parquet.BinaryFileConverterUtil.BINARY_GEN_BASEDIR;
@@ -78,6 +79,7 @@
private static Uploader fixedDataLoader;
private static Uploader mixedDataLoader;
private static Uploader bomFileLoader;
+ private static Uploader browseDataLoader;
protected TestCaseContext tcCtx;
@@ -148,6 +150,16 @@
ExternalDatasetTestUtils.bomFileLoader = bomFileLoader;
}
+    /**
+     * Registers the uploaders used by this utility class, including the one for the browse container.
+     * Each argument is stored in the static field of the same name.
+     *
+     * @param playgroundDataLoader uploader stored in {@code playgroundDataLoader}
+     * @param dynamicPrefixAtStartDataLoader uploader stored in {@code dynamicPrefixAtStartDataLoader}
+     * @param fixedDataLoader uploader stored in {@code fixedDataLoader}
+     * @param mixedDataLoader uploader stored in {@code mixedDataLoader}
+     * @param bomFileLoader uploader stored in {@code bomFileLoader}
+     * @param browseDataLoader uploader stored in {@code browseDataLoader}
+     */
+    public static void setUploaders(Uploader playgroundDataLoader, Uploader dynamicPrefixAtStartDataLoader,
+            Uploader fixedDataLoader, Uploader mixedDataLoader, Uploader bomFileLoader, Uploader browseDataLoader) {
+        // The parameters shadow the static fields, hence the class-qualified targets.
+        ExternalDatasetTestUtils.browseDataLoader = browseDataLoader;
+        ExternalDatasetTestUtils.bomFileLoader = bomFileLoader;
+        ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader;
+        ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader;
+        ExternalDatasetTestUtils.dynamicPrefixAtStartDataLoader = dynamicPrefixAtStartDataLoader;
+        ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader;
+    }
+
/**
* Creates a bucket and fills it with some files for testing purpose.
*/
@@ -183,6 +195,32 @@
LOGGER.info("Files added successfully");
}
+ public static void prepareBrowseContainer() {
+ /*
+ file hierarchy inside browse container
+ browse/1.json
+ browse/2.json
+ browse/level1/3.json
+ browse/level1/4.json
+ browse/level1/level2/5.json
+ browse/level2/level3/6.json
+ */
+ // -- todo:Utsav add a test for Browse S3 path which returns multiple folders, skipped for now as S3 mock server does not support this.
+ LOGGER.info("Adding JSON files to " + BROWSE_CONTAINER);
+ browseDataLoader.upload("1.json", "{\"id\":" + 1 + "}");
+ browseDataLoader.upload("2.json", "{\"id\":" + 2 + "}");
+ browseDataLoader.upload("level1/3.json", "{\"id\":" + 3 + "}");
+ browseDataLoader.upload("level1/4.json", "{\"id\":" + 4 + "}");
+ browseDataLoader.upload("level1/level2/5.json", "{\"id\":" + 5 + "}");
+ browseDataLoader.upload("level2/level3/6.json", "{\"id\":" + 6 + "}");
+
+ //Adding 1000+ files
+ for (int i = 1; i <= 1500; i++) {
+ browseDataLoader.upload("level3/" + i + ".json", "{\"id\":" + i + "}");
+ }
+ LOGGER.info("JSON Files added successfully");
+ }
+
/**
* Special container where dynamic prefix is the first segment
*/
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
index 7912d57..9a2bfbc 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
@@ -147,6 +147,7 @@
protected TestCaseContext tcCtx;
public static final String PLAYGROUND_CONTAINER = "playground";
+ public static final String BROWSE_CONTAINER = "browse";
public static final String DYNAMIC_PREFIX_AT_START_CONTAINER = "dynamic-prefix-at-start-container";
public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not use, has fixed data
public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude";
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java
index 45e83b4..6a16913 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/aws/s3/S3Utils.java
@@ -48,6 +48,7 @@
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -81,6 +82,7 @@
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.S3ClientBuilder;
+import software.amazon.awssdk.services.s3.model.CommonPrefix;
import software.amazon.awssdk.services.s3.model.ListObjectsRequest;
import software.amazon.awssdk.services.s3.model.ListObjectsResponse;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
@@ -88,6 +90,7 @@
import software.amazon.awssdk.services.s3.model.S3Exception;
import software.amazon.awssdk.services.s3.model.S3Object;
import software.amazon.awssdk.services.s3.model.S3Response;
+import software.amazon.awssdk.services.s3.paginators.ListObjectsV2Iterable;
public class S3Utils {
private S3Utils() {
@@ -519,4 +522,58 @@
}
}
}
+
+    /**
+     * Lists the files and folders found directly under the given prefix of a container (a single level deep).
+     * Builds an S3 client from the configuration, performs the listing and closes the client afterwards.
+     *
+     * @param configuration properties used to build the S3 client
+     * @param container container name
+     * @param prefix key prefix whose direct children are listed
+     * @return a map with the keys {@code "files"} and {@code "folders"}, each mapped to the matching names with
+     *         the prefix stripped
+     * @throws CompilationException presumably raised when the S3 client cannot be built -- confirm against
+     *         buildAwsS3Client
+     * @throws HyracksDataException declared by the listing helper
+     */
+    public static Map<String, List<String>> S3ObjectsOfSingleDepth(Map<String, String> configuration, String container,
+            String prefix) throws CompilationException, HyracksDataException {
+        // The listing helper fully consumes the paginated response before returning, so the client can be
+        // closed as soon as it returns; leaving it open would leak the client's resources.
+        try (S3Client s3Client = buildAwsS3Client(configuration)) {
+            return listS3ObjectsOfSingleDepth(s3Client, container, prefix);
+        }
+    }
+
+    /**
+     * Uses the latest API (ListObjectsV2) to retrieve the objects of a single level from the storage.
+     * The request uses "/" as delimiter, so keys nested deeper than one level are reported as common prefixes
+     * (folders) instead of individual objects. All result pages are consumed before this method returns.
+     *
+     * @param s3Client S3 client
+     * @param container container name
+     * @param prefix definition prefix; {@code null} is treated as the empty prefix when stripping names
+     * @return a map with the keys {@code "files"} and {@code "folders"}; names are relative to the prefix, folder
+     *         names have no trailing "/", and the object whose key equals the prefix itself is not reported
+     */
+    private static Map<String, List<String>> listS3ObjectsOfSingleDepth(S3Client s3Client, String container,
+            String prefix) throws HyracksDataException {
+        // Guard the name stripping below against a null prefix (the request itself accepts an unset prefix).
+        String basePrefix = prefix == null ? "" : prefix;
+        int prefixLength = basePrefix.length();
+
+        ListObjectsV2Request request =
+                ListObjectsV2Request.builder().bucket(container).prefix(prefix).delimiter("/").build();
+        ListObjectsV2Iterable pages = s3Client.listObjectsV2Paginator(request);
+
+        List<String> files = new ArrayList<>();
+        List<String> folders = new ArrayList<>();
+        for (ListObjectsV2Response page : pages) {
+            // put all the files
+            for (S3Object object : page.contents()) {
+                String key = object.key();
+                // The prefix itself may come back as an object; it is not a file under the prefix, so skip it.
+                if (!basePrefix.equals(key)) {
+                    files.add(key.substring(prefixLength));
+                }
+            }
+            // put all the folders
+            for (CommonPrefix commonPrefix : page.commonPrefixes()) {
+                String folderName = commonPrefix.prefix().substring(prefixLength);
+                folders.add(folderName.endsWith("/") ? folderName.substring(0, folderName.length() - 1) : folderName);
+            }
+        }
+
+        Map<String, List<String>> allObjects = new HashMap<>();
+        allObjects.put("files", files);
+        allObjects.put("folders", folders);
+        return allObjects;
+    }
}