Adjust the readme

64a49f18 · abir.chebbi · a5728da8 · 64a49f18 · 64a49f18 · 64a49f18
Commit 64a49f18 authored 9 months ago by abir.chebbi
--- a/Part 1/create-S3-and-put-docs.py
+++ b/Part 1/create-S3-and-put-docs.py
@@ -24,7 +24,7 @@ def write_files(s3_client, directory, bucket):
                    Key=filename
                )
                print(f"{filename} uploaded successfully.")
-                
+
 def main(bucket_name, local_dir):
    s3_client = boto3.client('s3')
    create_bucket(s3_client, bucket_name)
@@ -32,8 +32,8 @@ def main(bucket_name, local_dir):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
-    parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
-    parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
+    parser.add_argument("--bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
+    parser.add_argument("--local_path", help="The name of the folder to put the pdf files")
    args = parser.parse_args()
-    main(args.bucket_name, args.LOCAL_DIR)
+    main(args.bucket_name, args.local_path)

--- a/Part 1/create-vector-db.py
+++ b/Part 1/create-vector-db.py
@@ -152,7 +152,7 @@ def main(collection_name,IAM_USER):

 if __name__== "__main__":
    parser = argparse.ArgumentParser(description="Create collection")
-    parser.add_argument("collection_name", help="The name of the collection")
-    parser.add_argument("iam_user", help="The iam user")
+    parser.add_argument("--collection_name", help="The name of the collection")
+    parser.add_argument("--iam_user", help="The iam user")
    args = parser.parse_args()
    main(args.collection_name,args.iam_user)
--- a/Part 1/main.py
+++ b/Part 1/main.py
@@ -165,8 +165,8 @@ def main(bucket_name, endpoint,index_name):

 if __name__== "__main__":
    parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
-    parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
-    parser.add_argument("endpoint", help="The OpenSearch service endpoint")
-    parser.add_argument("index_name", help="The name of the OpenSearch index")
+    parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
+    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
+    parser.add_argument("--index_name", help="The name of the OpenSearch index")
    args = parser.parse_args()
    main(args.bucket_name, args.endpoint, args.index_name)
--- a/Part 2/main.py
+++ b/Part 2/main.py
@@ -7,8 +7,9 @@ from langchain.chains import RetrievalQA
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.chat_models import BedrockChat
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
-
 from langchain import PromptTemplate
+import argparse
+
 # Embeddings Client
 bedrock_client = boto3.client(service_name="bedrock-runtime")

@@ -24,26 +25,26 @@ st.set_page_config(
 st.title("Chat with your lecture")


-# AWS and OpenSearch Configuration
-host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'  
-index_name = 'cloud_lecture'
-awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+

 # OpenSearch Client
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': 443}],
+def ospensearch_client(endpoint):
+    awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+    client = OpenSearch(
+    hosts=[{'host': endpoint, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
-)
+    )
+    return client

 def get_embedding(question, bedrock_client):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    embedding = embeddings_model.embed_query(question)
    return embedding

-def similarity_search(embed_query, index_name):
+def similarity_search(client, embed_query, index_name):
    query_body = {
        "size": 5,
        "query": {
@@ -55,7 +56,7 @@ def similarity_search(embed_query, index_name):
            }
        }
    }
-    response = opensearch_client.search(index=index_name, body=query_body)
+    response = client.search(index=index_name, body=query_body)
    return response['hits']['hits']

 def prepare_prompt(question, context):
@@ -85,7 +86,9 @@ def generate_answer(prompt):
    return answer


-def main():
+def main(endpoint, index_name):
+
+    oss_client= ospensearch_client(endpoint)

    # initialize chat session in streamlit if not already present
    if "chat_history" not in st.session_state:
@@ -110,7 +113,7 @@ def main():
 
        embed_question= get_embedding(user_prompt,bedrock_client)
        print(embed_question)
-        sim_results = similarity_search(embed_question, index_name)
+        sim_results = similarity_search(oss_client, embed_question, index_name)
        context = [i['_source']['text'] for i in sim_results]
        print(context)
        prompt = prepare_prompt(user_prompt, context)
@@ -122,6 +125,11 @@ def main():
                st.markdown(message["content"])

 if __name__== "__main__":
-    main()
+    # Argument parsing
+    parser = argparse.ArgumentParser(description='Configure endpoint and index name for the lecture chat application.')
+    parser.add_argument('endpoint', type=str, help='The endpoint for the OpenSearch service.')
+    parser.add_argument('index_name', type=str, help='The index name for storing embeddings.')
+    args = parser.parse_args()
+    main(args.endpoint, args.index_name)

 
--- a/README.md
+++ b/README.md
 # chatbot-lab

 ## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
-2. Ensure python is installed: python 3.8 or higher
-2. Install required python libraries listed in the 'requirements.txt': 
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to the setup guide provided in Session 1).
+2. Ensure Python is installed: Python 3.8 or higher.
+3. Install the required Python libraries listed in 'requirements.txt': 
 `pip3 install -r requirements.txt`


 ## Part 1: 

 ### Step 1: Object storage Creation
-Create an S3 bucket and upload a few PDF files (Detailed steps are provided in the first session).
+Create an S3 bucket and upload a few PDF files by running: 
+`python create-S3-and-put-docs.py --bucket_name [YourBucketName] --local_path [PathToYourPDFFiles]`
+Where:
+`--bucket_name`: The name for the new S3 bucket to be created.
+`--local_path`: The local directory path where the PDF files are stored.
+

 ### Step 2: Vector Store Creation
-To set up the Vector Store, run the following command: `python create-vector-db.py`
+Create a vector database for storing embeddings by running: 
+`python create-vector-db.py --collection_name [Name_of_collection] --iam_user [YourIAM_User]`
+Where: 
+`--collection_name`: Name of the collection that you want to create to store embeddings.
+`--iam_user`: The name of your IAM user. For example, for group 14 the IAM user is master-group-14.
+

 This script performs the following actions:

-* Set up the security policies: Sets up encryption, network, and data access policies for collections starting with "test".
-* Vector Store Initialization: Creates a vector store named test1, specifically designed for vector search operations.
-* Endpoint Retrieval: After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
+* Sets up encryption, network, and data access policies for the collection.
+* Creates a vector store using the collection name passed as an argument.
+* After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.

 ### Step 3: Vectorizing the PDF Files
-After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
-* In main.py, update the S3 bucket name to the one you created.
-* Update the Vector Store endpoint with the one provided by the setup script.
-* Execute the processing script: `python main.py`
+After setting up the S3 bucket and Vector Store, you can process the PDF files to generate embeddings and store them in the vector database.
+
+Run: 
+`python main.py --bucket_name [YourBucketName] --endpoint [YourVectorDBEndpoint] --index_name [YourIndexName]`
+
+Where: 
+`--bucket_name`: The name of the S3 bucket containing the PDF files.
+`--endpoint`: Endpoint for the vector database.
+`--index_name`: The name of the index in the collection where the embeddings will be stored.

 The main.py script will:
 1. Download PDF files from the S3 bucket.
 2. Split them into chunks.
 3. Generate embeddings from the chunks.
-4. Store these embeddings in the OpenSearch Vector DB.
+4. Create an index in the vector DB.
+5. Store these embeddings in the OpenSearch Vector DB.


 ## Part 2: