Skip to content

Commit 033c6a5

Browse files
authored
Merge pull request #116 from DarkRaiderCB/CRAG-Firecrawl-LMStudio
CRAG using FireCrawl and LMStudio
2 parents e1ab953 + fe2a41f commit 033c6a5

File tree

10 files changed

+3687
-0
lines changed

10 files changed

+3687
-0
lines changed

firecrawl-agent/.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Python-generated files
2+
__pycache__/
3+
*.py[oc]
4+
build/
5+
dist/
6+
wheels/
7+
*.egg-info
8+
9+
# Virtual environments
10+
.venv
11+
*.env

firecrawl-agent/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.11

firecrawl-agent/README.md

Whitespace-only changes.

firecrawl-agent/app.py

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
from contextlib import redirect_stdout
2+
import io
3+
from workflow import CorrectiveRAGWorkflow
4+
from llama_index.core import Settings
5+
from llama_index.embeddings.fastembed import FastEmbedEmbedding
6+
from llama_index.vector_stores.qdrant import QdrantVectorStore
7+
from llama_index.llms.lmstudio import LMStudio
8+
from llama_index.core import StorageContext
9+
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
10+
from IPython.display import Markdown, display
11+
import time
12+
import uuid
13+
import tempfile
14+
import gc
15+
import base64
16+
import qdrant_client
17+
import streamlit as st
18+
import asyncio
19+
import os
20+
import sys
21+
import logging
22+
from dotenv import load_dotenv
23+
import nest_asyncio
24+
# Streamlit already owns an event loop; allow nested asyncio.run() calls.
nest_asyncio.apply()

load_dotenv()


# Set up page configuration
st.set_page_config(page_title="Corrective RAG Demo", layout="wide")

# Initialize session state variables.
# A fresh session gets a unique id plus an empty per-session workflow cache.
if "id" not in st.session_state:
    st.session_state.id = uuid.uuid4()
    st.session_state.file_cache = {}

# setdefault leaves existing values untouched on reruns.
st.session_state.setdefault("workflow", None)
st.session_state.setdefault("messages", [])
st.session_state.setdefault("workflow_logs", [])

session_id = st.session_state.id
47+
48+
49+
@st.cache_resource
def load_llm():
    """Return a cached LM Studio LLM client pointed at the local server."""
    return LMStudio(
        model_name="deepseek-r1-distill-qwen-7b",
        base_url="http://localhost:1234/v1",
        temperature=0.1,
    )
57+
58+
59+
def reset_chat():
    """Drop the chat history and reclaim memory."""
    st.session_state["messages"] = []
    gc.collect()
62+
63+
64+
def display_pdf(file):
    """Render the uploaded PDF inline using a base64 data-URI iframe."""
    st.markdown("### PDF Preview")
    encoded = base64.b64encode(file.read()).decode("utf-8")

    # Embedding PDF in HTML
    iframe_html = f"""<iframe src="data:application/pdf;base64,{encoded}" width="400" height="100%" type="application/pdf"
                        style="height:100vh; width:100%"
                        >
                        </iframe>"""

    # Displaying File
    st.markdown(iframe_html, unsafe_allow_html=True)
76+
77+
# Function to initialize the workflow with uploaded documents
def initialize_workflow(file_path):
    """Index the documents under *file_path* and build the CRAG workflow.

    Loads every document in the directory, embeds it into a local Qdrant
    collection, and wires the resulting index into a CorrectiveRAGWorkflow
    backed by the LM Studio LLM.

    Parameters
    ----------
    file_path : str
        Directory containing the uploaded document(s).

    Returns
    -------
    CorrectiveRAGWorkflow
        The initialized workflow (also stored in ``st.session_state.workflow``).

    Raises
    ------
    ValueError
        If no Firecrawl API key is available, instead of the opaque
        ``KeyError`` that ``os.environ[...]`` would raise.
    """
    # Fail early with an actionable message; the upload handler catches
    # Exception and surfaces this text to the user.
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        raise ValueError(
            "FIRECRAWL_API_KEY is not set. Enter your Firecrawl API key in the sidebar first."
        )

    with st.spinner("Loading documents and initializing the workflow..."):
        documents = SimpleDirectoryReader(file_path).load_data()

        # Assumes a Qdrant instance is running locally — TODO confirm in deployment.
        client = qdrant_client.QdrantClient(
            host="localhost",
            port=6333
        )

        vector_store = QdrantVectorStore(client=client, collection_name="test")
        embed_model = FastEmbedEmbedding(model_name="BAAI/bge-large-en-v1.5")
        Settings.embed_model = embed_model
        storage_context = StorageContext.from_defaults(
            vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
        )

        workflow = CorrectiveRAGWorkflow(
            index=index,
            firecrawl_api_key=api_key,
            verbose=True,
            timeout=60,
            llm=load_llm()
        )

        st.session_state.workflow = workflow
        return workflow
109+
110+
# Function to run the async workflow
async def run_workflow(query):
    """Run the CRAG workflow for *query*, capturing its stdout as logs.

    The workflow prints progress when verbose; everything it writes to
    stdout is appended to ``st.session_state.workflow_logs`` for display.
    """
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        result = await st.session_state.workflow.run(query_str=query)

    captured = buffer.getvalue()
    if captured:
        st.session_state.workflow_logs.append(captured)

    return result
125+
126+
# Sidebar for document upload
with st.sidebar:
    # FireCrawl logo and the configuration header share one row
    logo_col, header_col = st.columns([1, 3])
    with logo_col:
        st.write("")  # vertical spacer so the logo aligns with the header
        st.image("./assets/firecrawl_logo.png", width=65)
    with header_col:
        st.header("Firecrawl Configuration")
        st.write("Deep Web Search")

    # Link for obtaining an API key
    st.markdown("[Get your API key](https://www.firecrawl.dev/signin/signup)",
                unsafe_allow_html=True)

    firecrawl_api_key = st.text_input(
        "Enter your Firecrawl API Key", type="password")

    # Persist the key as an environment variable for the workflow to read
    if firecrawl_api_key:
        os.environ["FIRECRAWL_API_KEY"] = firecrawl_api_key
        st.success("API Key stored successfully!")

    st.header("Add your documents!")

    uploaded_file = st.file_uploader("Choose your `.pdf` file", type="pdf")

    if uploaded_file:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                file_path = os.path.join(temp_dir, uploaded_file.name)

                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getvalue())

                file_key = f"{session_id}-{uploaded_file.name}"
                st.write("Indexing your document...")

                if file_key in st.session_state.get('file_cache', {}):
                    # Reuse the workflow built for this file earlier in the session
                    st.session_state.workflow = st.session_state.file_cache[file_key]
                else:
                    # Initialize workflow with the uploaded document
                    st.session_state.file_cache[file_key] = initialize_workflow(temp_dir)

                # Inform the user that the file is processed and display the PDF
                st.success("Ready to Chat!")
                display_pdf(uploaded_file)
        except Exception as e:
            st.error(f"An error occurred: {e}")
            st.stop()
178+
179+
# Main chat interface
col1, col2 = st.columns([6, 1])

with col1:
    # Styled page title
    st.markdown("<h2 style='color: #0066cc;'>⚙️ Corrective RAG agentic workflow</h2>",
                unsafe_allow_html=True)
    # "Powered by LlamaIndex" subtitle with an inline logo image.
    # Use a context manager so the image file handle is closed (the original
    # open(...).read() leaked it).
    with open("./assets/images.jpeg", "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
    st.markdown("<div style='display: flex; align-items: center; gap: 10px;'><span style='font-size: 28px; color: #666;'>Powered by LlamaIndex</span><img src='data:image/png;base64,{}' width='50'></div>".format(
        logo_b64
    ), unsafe_allow_html=True)

with col2:
    st.button("Clear ↺", on_click=reset_chat)
193+
194+
# Display chat messages from history on app rerun
for idx, message in enumerate(st.session_state.messages):
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

    # For a user message that carries a log index, show the captured
    # workflow logs AFTER the user bubble but BEFORE the next assistant one.
    followed_by_reply = idx < len(st.session_state.messages) - 1
    if message["role"] == "user" and "log_index" in message and followed_by_reply:
        log_index = message["log_index"]
        if log_index < len(st.session_state.workflow_logs):
            with st.expander("View Workflow Execution Logs", expanded=False):
                st.code(st.session_state.workflow_logs[log_index], language="text")
207+
208+
# Accept user input
if prompt := st.chat_input("Ask a question about your documents..."):
    # Record the user message together with the index its logs will occupy
    log_index = len(st.session_state.workflow_logs)
    st.session_state.messages.append(
        {"role": "user", "content": prompt, "log_index": log_index})

    # Echo the user message in its chat bubble
    with st.chat_message("user"):
        st.markdown(prompt)

    result = None
    if st.session_state.workflow:
        # Run the async workflow to completion
        result = asyncio.run(run_workflow(prompt))

        # Show the captured logs OUTSIDE and BEFORE the assistant chat bubble
        if log_index < len(st.session_state.workflow_logs):
            with st.expander("View Workflow Execution Logs", expanded=False):
                st.code(st.session_state.workflow_logs[log_index], language="text")

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        if st.session_state.workflow:
            message_placeholder = st.empty()
            full_response = ""

            response_text = result.response

            # Simulate streaming: reveal the answer word by word
            words = response_text.split()
            last_index = len(words) - 1
            for pos, word in enumerate(words):
                full_response += word + " "
                message_placeholder.markdown(full_response + "▌")
                if pos < last_index:  # don't delay after the last word
                    time.sleep(0.1)

            # Final render without the cursor
            message_placeholder.markdown(full_response)
        else:
            full_response = "Please upload a document first to initialize the workflow."
            st.markdown(full_response)

    # Persist the assistant reply in the chat history
    st.session_state.messages.append(
        {"role": "assistant", "content": full_response})
108 KB
Loading
73.9 KB
Loading

firecrawl-agent/assets/images.jpeg

3.85 KB
Loading

firecrawl-agent/pyproject.toml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[project]
2+
name = "firecrawl-agent"
3+
version = "0.1.0"
4+
description = "Corrective RAG (CRAG) agent built with Firecrawl, LM Studio, LlamaIndex, and Qdrant"
5+
readme = "README.md"
6+
requires-python = ">=3.11"
7+
dependencies = [
8+
"firecrawl-py>=2.4.1",
9+
"ipython>=9.2.0",
10+
"llama-index>=0.12.33",
11+
"llama-index-embeddings-fastembed>=0.3.1",
12+
"llama-index-llms-lmstudio>=0.3.0",
13+
"llama-index-readers-web>=0.3.9",
14+
"llama-index-vector-stores-qdrant>=0.6.0",
15+
"lmstudio>=1.3.0",
16+
"qdrant-client>=1.14.2",
17+
"streamlit>=1.44.1",
18+
]

0 commit comments

Comments
 (0)