diff --git a/Part 1/Create-Vector-DB.py b/Part 1/Create-Vector-DB.py new file mode 100644 index 0000000000000000000000000000000000000000..4e63c5e1d1a89d0f4f6daa20883a2e0d52bea9c7 --- /dev/null +++ b/Part 1/Create-Vector-DB.py @@ -0,0 +1,155 @@ +## Source: https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-sdk.html +import boto3 +import botocore +import time + + +client = boto3.client('opensearchserverless') +service = 'aoss' +Vector_store_name='test1' + +def createEncryptionPolicy(client): + """Creates an encryption policy that matches all collections beginning with test""" + try: + response = client.create_security_policy( + description='Encryption policy for test collections', + name='test-policy', + policy=""" + { + \"Rules\":[ + { + \"ResourceType\":\"collection\", + \"Resource\":[ + \"collection\/test*\" + ] + } + ], + \"AWSOwnedKey\":true + } + """, + type='encryption' + ) + print('\nEncryption policy created:') + print(response) + except botocore.exceptions.ClientError as error: + if error.response['Error']['Code'] == 'ConflictException': + print( + '[ConflictException] The policy name or rules conflict with an existing policy.') + else: + raise error + + +def createNetworkPolicy(client): + """Creates a network policy that matches all collections beginning with test""" + try: + response = client.create_security_policy( + description='Network policy for Test collections', + name='test-policy', + policy=""" + [{ + \"Description\":\"Public access for Test collection\", + \"Rules\":[ + { + \"ResourceType\":\"dashboard\", + \"Resource\":[\"collection\/test*\"] + }, + { + \"ResourceType\":\"collection\", + \"Resource\":[\"collection\/test*\"] + } + ], + \"AllowFromPublic\":true + }] + """, + type='network' + ) + print('\nNetwork policy created:') + print(response) + except botocore.exceptions.ClientError as error: + if error.response['Error']['Code'] == 'ConflictException': + print( + '[ConflictException] A network policy with this 
name already exists.') + else: + raise error + + +def createAccessPolicy(client): + """Creates a data access policy that matches all collections beginning with test""" + try: + response = client.create_access_policy( + description='Data access policy for Test collections', + name='test-policy', + policy=""" + [{ + \"Rules\":[ + { + \"Resource\":[ + \"index\/test*\/*\" + ], + \"Permission\":[ + \"aoss:CreateIndex\", + \"aoss:DeleteIndex\", + \"aoss:UpdateIndex\", + \"aoss:DescribeIndex\", + \"aoss:ReadDocument\", + \"aoss:WriteDocument\" + ], + \"ResourceType\": \"index\" + }, + { + \"Resource\":[ + \"collection\/test*\" + ], + \"Permission\":[ + \"aoss:CreateCollectionItems\", + \"aoss:DeleteCollectionItems\", + \"aoss:UpdateCollectionItems\", + \"aoss:DescribeCollectionItems\" + ], + \"ResourceType\": \"collection\" + } + ], + \"Principal\":[ + \"arn:aws:iam::768034348959:user/AbirChebbi\" + ] + }] + """, + type='data' + ) + print('\nAccess policy created:') + print(response) + except botocore.exceptions.ClientError as error: + if error.response['Error']['Code'] == 'ConflictException': + print( + '[ConflictException] An access policy with this name already exists.') + else: + raise error + + + +def waitForCollectionCreation(client): + """Waits for the collection to become active""" + time.sleep(40) + response = client.batch_get_collection( + names=['test1']) + print('\nCollection successfully created:') + print(response["collectionDetails"]) + # Extract the collection endpoint from the response + host = (response['collectionDetails'][0]['collectionEndpoint']) + final_host = host.replace("https://", "") + return final_host + + +def main(): + + createEncryptionPolicy(client) + createNetworkPolicy(client) + createAccessPolicy(client) + collection = client.create_collection(name=Vector_store_name,type='VECTORSEARCH') + ENDPOINT= waitForCollectionCreation(client) + + print("Collection created successfully:", collection) + print("Collection ENDPOINT:", ENDPOINT) + +if 
__name__== "__main__": + main() \ No newline at end of file diff --git a/Part 1/Delete-Vector-DB.py b/Part 1/Delete-Vector-DB.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Part 1/Delete-s3.py b/Part 1/Delete-s3.py new file mode 100644 index 0000000000000000000000000000000000000000..7d07f21c483b427cc800d4888fbde66e2c373b2b --- /dev/null +++ b/Part 1/Delete-s3.py @@ -0,0 +1,24 @@ +import boto3 + +BUCKET_NAME = 'cloud-lecture-2023' + +S3_CLIENT = boto3.client('s3') +S3_RESOURCE = boto3.resource('s3') + +# # # Delete Bucket + +# First, delete all objects in the Bucket +bucket = S3_RESOURCE.Bucket(BUCKET_NAME) + +print("Deleting all objects in Bucket\n") +bucket.objects.all().delete() + + +print("Deleting Bucket") +# Bucket Deletion +response = S3_CLIENT.delete_bucket( + Bucket=BUCKET_NAME + +) + +print(response) diff --git a/Part 1/create-S3-and-put-docs.py b/Part 1/create-S3-and-put-docs.py new file mode 100644 index 0000000000000000000000000000000000000000..7587e0c415b746d955d4e0b76d39ddc4886d20a2 --- /dev/null +++ b/Part 1/create-S3-and-put-docs.py @@ -0,0 +1,35 @@ +import boto3 +import os + +LOCAL_DIR = "pdfs" +BUCKET_NAME = 'cloud-lecture-2023' + +# Initiate S3 client +s3_client = boto3.client('s3') + +# Create S3 Bucket +print("Creating Bucket") +response = s3_client.create_bucket( + Bucket=BUCKET_NAME, +) +print(response) +print() + +# Function to write files to S3 +def write_files(directory, bucket): + for filename in os.listdir(directory): + if filename.endswith(".pdf"): # Check if the file is a PDF + file_path = os.path.join(directory, filename) + with open(file_path, 'rb') as file: + print(f"Uploading {filename} to bucket {bucket}...") + s3_client.put_object( + Body=file, + Bucket=bucket, + Key=filename + ) + print(f"{filename} uploaded successfully.") + +# Upload PDF files to S3 bucket +print("Writing Items to Bucket") +write_files(LOCAL_DIR, BUCKET_NAME) + diff --git a/Part
1/main.py b/Part 1/main.py index b318c014a7df50665a122fd3c6dd718577873f0d..f12dafd00a2b2862f5829481c4d21e8d8518654e 100644 --- a/Part 1/main.py +++ b/Part 1/main.py @@ -14,7 +14,7 @@ index_name = "cloud_lecture_test" ## S3_client s3_client = boto3.client('s3') ## Bucket name where documents are stored -BUCKET_NAME = "chatbotlab" +BUCKET_NAME = "cloud-lecture-2023" ## Bedrock client bedrock_client = boto3.client(service_name="bedrock-runtime") @@ -25,7 +25,7 @@ credentials = boto3.Session().get_credentials() awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss') ## Vector DB endpoint -host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com' +host= 'ispfynbvy6eov4efdsqd.us-east-1.aoss.amazonaws.com' ## Opensearch Client OpenSearch_client = OpenSearch( @@ -91,8 +91,12 @@ def split_text(pages, chunk_size, chunk_overlap, local_dir): return chunks ## Generate embeddings and index them using Opensearch +# def generate_embeddings(): -def generate_embeddings(bedrock_client, chunks,awsauth,index_name): +# def store_embeddings(): + + +def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name): embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client) docsearch = OpenSearchVectorSearch.from_documents( chunks, @@ -117,7 +121,7 @@ def main(): chunks=split_text(docs, 1000, 100, LOCAL_DIR) print("Sample chunk:", chunks[0]) create_index(index_name) - embeddings = generate_embeddings(bedrock_client, chunks,awsauth,index_name) + embeddings = generate_store_embeddings(bedrock_client, chunks,awsauth,index_name) print("Embeddings processing completed", embeddings)