Skip to content

Commit f6d905a

Browse files
Updated the existing index_datasets script to create the index
1 parent 75259d9 commit f6d905a

File tree

4 files changed

+90
-29
lines changed

4 files changed

+90
-29
lines changed

infra/scripts/Process-Sample-Data.ps1

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,11 @@ if ($ResourceGroup) {
119119
}
120120

121121
# Upload CSV and JSON files
122-
Write-Host "Uploading CSV files to blob storage..."
122+
Write-Host "Uploading CSV and JSON files to blob storage..."
123123
az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.csv" --overwrite --output none
# Check the CSV upload on its own: $LASTEXITCODE is overwritten by the next az call,
# so a check placed after the JSON upload cannot detect a failed CSV upload.
if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV files."; exit 1 }
124-
if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV files."; exit 1 }
125-
Write-Host "CSV files uploaded successfully."
124+
az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.json" --overwrite --output none
125+
if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV and JSON files."; exit 1 }
126+
Write-Host "CSV and JSON files uploaded successfully."
126127

127128
# Upload PDF files
128129
Write-Host "Uploading PDF files from RFP_dataset to blob storage..."
@@ -180,21 +181,29 @@ Write-Host "Installing requirements"
180181
pip install --quiet -r infra/scripts/requirements.txt
181182
Write-Host "Requirements installed"
182183

183-
# Run indexing scripts
184-
if ($hasCsv) {
185-
Write-Host "Running the python script to index CSV data"
186-
& $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
187-
if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
188-
}
189-
if ($hasPdf) {
190-
Write-Host "Running the python script to index PDF data"
191-
& $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
192-
if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
193-
}
194-
if (-not $hasCsv -and -not $hasPdf) {
195-
Write-Host "No CSV or PDF files found to index."
184+
Write-Host "Running the python script to index data"
185+
$process = Start-Process -FilePath $pythonCmd -ArgumentList "infra/scripts/index_datasets.py", $StorageAccount, $BlobContainer, $AiSearch, $AiSearchIndex -Wait -NoNewWindow -PassThru
186+
187+
if ($process.ExitCode -ne 0) {
188+
Write-Host "Error: Indexing python script execution failed."
189+
exit 1
196190
}
197191

192+
# Run indexing scripts
193+
# if ($hasCsv) {
194+
# Write-Host "Running the python script to index CSV data"
195+
# & $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
196+
# if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
197+
# }
198+
# if ($hasPdf) {
199+
# Write-Host "Running the python script to index PDF data"
200+
# & $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
201+
# if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
202+
# }
203+
# if (-not $hasCsv -and -not $hasPdf) {
204+
# Write-Host "No CSV or PDF files found to index."
205+
# }
206+
198207
# Disable public access again
199208
if ($stIsPublicAccessDisabled) {
200209
Write-Host "Disabling public access for storage account: $StorageAccount"

infra/scripts/index_datasets.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,49 @@
55
from azure.storage.blob import BlobServiceClient
66
import sys
77

8+
# PDF text extraction function
9+
def extract_pdf_text(pdf_bytes):
    """Extract the text content of an in-memory PDF using PyPDF2.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        str: The text of all pages that yielded non-blank text, joined with
        newlines and stripped. When no usable text can be produced, a
        sentinel message is returned instead:
          * "PDF_PROTECTED: ..." - the reader reports the file as encrypted,
            or the extracted text contains a known protection notice.
          * "PDF_NO_TEXT: ..."   - no page yielded readable text.
          * "PDF_ERROR: ..."     - PyPDF2 could not be imported, or reading
            the document raised an exception.

    Errors derived from ``Exception`` are not propagated; they are reported
    through the returned string.
    """
    try:
        # Imported here so that a missing PyPDF2 is reported as a PDF_ERROR
        # string (see the ImportError handler below) rather than failing at
        # module import time.
        import PyPDF2
        import io

        pdf_file = io.BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Check if PDF is encrypted/protected
        if pdf_reader.is_encrypted:
            return "PDF_PROTECTED: This PDF document is password-protected or encrypted and cannot be processed."

        text_content = []
        for page in pdf_reader.pages:
            try:
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    text_content.append(page_text)
            except Exception:
                # Best effort: a page that fails to extract is skipped so the
                # remaining pages can still contribute text.
                continue

        full_text = "\n".join(text_content).strip()

        # Phrases whose presence in the extracted text marks the document as
        # protected instead of carrying real content.
        protection_indicators = [
            "protected by Microsoft Office",
            "You'll need a different reader",
            "Download a compatible PDF reader",
            "This PDF Document has been protected"
        ]

        # Lower-case the document once instead of once per indicator.
        lowered_text = full_text.lower()
        if any(indicator.lower() in lowered_text for indicator in protection_indicators):
            return "PDF_PROTECTED: This PDF document appears to be protected or encrypted."

        return full_text if full_text else "PDF_NO_TEXT: No readable text content found in PDF."

    except ImportError:
        return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
    except Exception as e:
        return f"PDF_ERROR: Error reading PDF content: {str(e)}"
50+
851
if len(sys.argv) < 4:
952
print("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint> [<ai_search_index_name>]")
1053
sys.exit(1)
@@ -51,11 +94,19 @@
5194
#if blob.name.endswith(".csv"):
5295
# Strip every known data-file extension from the blob name. The replacements
# are chained so each one builds on the previous result; assigning from
# blob.name three times kept only the last (".pdf") replacement.
title = blob.name.replace(".csv", "").replace(".json", "").replace(".pdf", "")
5498
data = container_client.download_blob(blob.name).readall()
5599

56100
try:
57101
print(f"Reading data from blob: {blob.name}...")
58-
text = data.decode('utf-8')
102+
103+
# Check if this is a PDF file and process accordingly
104+
if blob.name.lower().endswith('.pdf'):
105+
text = extract_pdf_text(data)
106+
else:
107+
# Original processing for non-PDF files
108+
text = data.decode('utf-8')
109+
59110
data_list.append({
60111
"content": text,
61112
"id": str(idx),

infra/scripts/process_sample_data.sh

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,14 @@ fi
123123

124124

125125
#Upload sample CSV and JSON files to blob storage
126-
echo "Uploading CSV sample files to blob storage..."
126+
echo "Uploading CSV and JSON sample files to blob storage..."
127127
# Chain the two uploads with && so that the exit status seen by the check that
# follows is non-zero when EITHER upload fails. Run as separate commands, a
# failed CSV upload was masked by a successful JSON upload.
az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.csv' --overwrite --output none \
  && az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.json' --overwrite --output none
128129
if [ $? -ne 0 ]; then
129-
echo "Error: Failed to upload CSV files to blob storage."
130+
echo "Error: Failed to upload CSV and JSON files to blob storage."
130131
exit 1
131132
fi
132-
echo "CSV files uploaded successfully to blob storage."
133+
echo "CSV and JSON files uploaded successfully to blob storage."
133134

134135
#Upload PDF files from RFP_dataset to blob storage
135136
echo "Uploading PDF files from RFP_dataset to blob storage..."
@@ -194,14 +195,14 @@ if [ "$has_csv" = true ]; then
194195
fi
195196
fi
196197

197-
if [ "$has_pdf" = true ]; then
198-
echo "Running the python script to index PDF data"
199-
$PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
200-
if [ $? -ne 0 ]; then
201-
echo "Error: PDF indexing python script execution failed."
202-
exit 1
203-
fi
204-
fi
198+
# if [ "$has_pdf" = true ]; then
199+
# echo "Running the python script to index PDF data"
200+
# $PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
201+
# if [ $? -ne 0 ]; then
202+
# echo "Error: PDF indexing python script execution failed."
203+
# exit 1
204+
# fi
205+
# fi
205206

206207
if [ "$has_csv" = false ] && [ "$has_pdf" = false ]; then
207208
echo "No CSV or PDF files found to index."

infra/scripts/upload_team_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
print(f"Scanning directory: {directory_path}")
1616

1717
files_to_process = [
18-
("RFP_Analysis_team", "00000000-0000-0000-0000-000000000001"),
18+
("RFP_Analysis_team.json", "00000000-0000-0000-0000-000000000001"),
1919
("hr.json", "00000000-0000-0000-0000-000000000002"),
2020
("marketing.json", "00000000-0000-0000-0000-000000000003"),
2121
("retail.json", "00000000-0000-0000-0000-000000000004"),

0 commit comments

Comments
 (0)