unconv · MohamedBasueny · Dec 29, 2023
diff --git a/.env.example b/.env.example
@@ -0,0 +1,5 @@
+OPENAI_API_KEY=
+LANGCHAIN_PROJECT=
+LANGCHAIN_API_KEY=
+LANGCHAIN_TRACING_V2=
+ELEVENLABS_KEY=
diff --git a/README.md b/README.md
@@ -1,23 +1,10 @@
-# Shortrocity
+# short-videos
+Generate short videos using unstructured, LangChain, cv2 (OpenCV), ElevenLabs, and OpenAI.
+# What is the difference?
+1. This app uses LangChain.
+2. It uses AI models to extract the narration and image descriptions instead of heuristics.
+3. It includes some tests for matching lengths.
+4. It uses LangSmith for monitoring.
+5. The app accepts a URL for any HTML page instead of requiring the text to be copied manually.
+6. [Future] Deploy as a REST API using LangServe.
 
-Shortrocity is a tool for making AI generated short videos ("shorts" or "reels") with a ChatGPT generated script, narrated by ElevenLabs or OpenAI text-to-speech. DALL-E 3 generated background images are also added to the background.
-
-## Quick Start
-
-First, add your API-keys to the environment:
-
-```console
-$ export OPENAI_API_KEY=YOUR_OPENAI_API_KEY
-$ export ELEVENLABS_API_KEY=YOUR_ELEVENLABS_API_KEY
-```
-
-Then, put your source content in a file, for example `source.txt` and run the `main.py`:
-
-```console
-$ ./main.py source.txt
-Generating script...
-Generating narration...
-Generating images...
-Generating video...
-DONE! Here's your video: shorts/1701788183/short.avi
-``````
diff --git a/app/__init__.py b/app/__init__.py
diff --git a/app/audio.py b/app/audio.py
@@ -0,0 +1,33 @@
+"""
+Create narration audio from text.
+"""
+from data_parser import parse_data
+from templates import template_images , template_narrator
+from utils import create_dict_pairs
+from elevenlabs import set_api_key , generate , save 
+from load_dotenv import load_dotenv
+load_dotenv()
+import os 
+set_api_key(os.getenv("ELEVENLABS_KEY"))
+
+def concatenate_text():
+    """Build the whole narration script as a single string.
+
+    Parses the narrations and the image descriptions, pairs them with
+    ``create_dict_pairs`` and joins the ``"text"`` of every pair, each one
+    followed by a blank line.
+    """
+    narrations = parse_data(template_narrator)
+    image_descriptions = parse_data(template_images)
+    pairs = create_dict_pairs(narrations, image_descriptions)
+    return "".join(pair["text"] + "\n\n" for pair in pairs)
+
+def generate_audio(text, voice="T7QGPtToiqH4S8VlIkMJ",
+                   model="eleven_multilingual_v2",
+                   filename="./data/audio.mp3"):
+    """Turn ``text`` into speech and store it as an audio file.
+
+    Args:
+        text: Narration to synthesize.
+        voice: ElevenLabs voice identifier passed to ``generate``.
+        model: ElevenLabs model name passed to ``generate``.
+        filename: Destination path of the audio file.
+    """
+    audio = generate(text=text, voice=voice, model=model)
+    # Make sure the destination folder exists so saving does not fail on a
+    # fresh checkout where ``./data`` has not been created yet.
+    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
+    save(audio=audio, filename=filename)
+
+
+# Running this module generates the narration audio for the parsed script.
+generate_audio(
+    text=concatenate_text()
+)
diff --git a/app/data_parser.py b/app/data_parser.py
@@ -0,0 +1,55 @@
+from langchain.output_parsers import CommaSeparatedListOutputParser
+from langchain.prompts import PromptTemplate , SystemMessagePromptTemplate , ChatPromptTemplate
+from langchain_community.chat_models import ChatOpenAI
+from narration import call 
+from langchain.schema.messages import SystemMessage
+from utils import clean 
+from templates import template_images , template_narrator
+
+# context = call()
+# output_parser = CommaSeparatedListOutputParser()
+# format_instructions = output_parser.get_format_instructions()
+# print(format_instructions)
+
+# prompt = PromptTemplate(
+#     template=template,
+#     input_variables=["context"],
+# )
+
+# chat_template = ChatPromptTemplate.from_messages(
+#     [SystemMessagePromptTemplate.from_template(template=template_images)])
+
+# model = ChatOpenAI(temperature=0 ,model_name="gpt-3.5-turbo-1106" , )
+
+# input = chat_template.format_messages(context=context)
+
+# messages = [
+#     SystemMessage(_input)
+# ]
+# print(input)
+
+# chain =  chat_template | model
+# output = chain.invoke({"context":context}).content
+# clean(data=output)
+# print(f"model output is\n\n {output} , \nfirst element is \n   {output[0]} , \n type is {type(output)}")
+# print(output.split("\n")[0])
+# text = output_parser.parse(output)
+# print(context)
+# print("\n\n***********\n\n")
+# print(
+
+
+
+def parse_data(template:str, context=None)->list:
+    """Extract a list of values from the generated script with the chat model.
+
+    Args:
+        template: System-prompt template containing a ``{context}``
+            placeholder (for example ``template_narrator`` or
+            ``template_images``).
+        context: Script to extract from. When omitted, a new script is
+            produced by ``call()``.
+
+    Returns:
+        The model output split into lines and cleaned by ``clean``.
+
+    NOTE(review): every call without ``context`` asks ``call()`` for a new
+    script, so two extractions (narrations and images) may be parsed from
+    different scripts. Pass the same ``context`` to both to keep them
+    consistent.
+    """
+    if context is None:
+        context = call()
+    chat_template = ChatPromptTemplate.from_messages(
+        [SystemMessagePromptTemplate.from_template(template=template)])
+    model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106")
+
+    # The chain fills the template itself, so no separate formatting step
+    # is needed before invoking it.
+    chain = chat_template | model
+    output = chain.invoke({"context": context}).content
+    return clean(data=output)
+
diff --git a/app/generate_images.py b/app/generate_images.py
@@ -0,0 +1,45 @@
+"""
+A text-to-image module.
+Responsible for generating images with OpenAI's DALL-E 3 model.
+"""
+import base64
+from data_parser import parse_data
+from templates import template_images , template_narrator
+from utils import create_dict_pairs
+import os 
+
+
+def get_images_descriptions():
+    """Return the image description of every narration/image pair.
+
+    Parses the narrations and the image descriptions, pairs them with
+    ``create_dict_pairs`` and collects the ``"image"`` value of each pair.
+    """
+    narrations = parse_data(template_narrator)
+    descriptions = parse_data(template_images)
+    return [pair["image"] for pair in create_dict_pairs(narrations, descriptions)]
+
+def generate_images(images=None):
+    """Generate one DALL-E 3 image per description and save it to disk.
+
+    Args:
+        images: Iterable of image prompts. When omitted, the prompts are
+            obtained from ``get_images_descriptions()`` at call time.
+
+    Each image is written to ``./data/images/image_<index>.webp``.
+    """
+    from openai import OpenAI
+
+    # Resolve the default here rather than in the signature: a default
+    # expression is evaluated once when the function is defined, which would
+    # run the whole description pipeline even when ``images`` is supplied.
+    if images is None:
+        images = get_images_descriptions()
+
+    client = OpenAI()
+    os.makedirs("./data/images/", exist_ok=True)
+
+    for i, img in enumerate(images):
+        response = client.images.generate(
+            model="dall-e-3",
+            prompt=img,
+            size="1024x1024",
+            quality="standard",
+            n=1,
+            response_format="b64_json",
+        )
+        image_b64 = response.data[0].b64_json
+
+        # NOTE(review): the decoded bytes are written as returned by the API;
+        # the ``.webp`` extension is not checked against the actual format.
+        with open(f"./data/images/image_{i}.webp", "wb") as f:
+            f.write(base64.b64decode(image_b64))
+
+
+# Running this module generates the images for the parsed script.
+generate_images()
diff --git a/app/main.py b/app/main.py
@@ -0,0 +1,12 @@
+"""
+Main entry point that orchestrates the whole pipeline.
+It should call the main functions that handle the following steps:
+1. Generate the data.
+2. Clean the generated data.
+3. Get structured data.
+4. Generate the audio.
+5. Generate the images.
+6. Create the final video.
+"""
+from app.data_parser  import parse_data 
+from app.templates import * 
diff --git a/app/narration.py b/app/narration.py
@@ -0,0 +1,31 @@
+"""
+Create narration text from the raw text of a given website or article.
+Ideas:
+1. Scrape a website's text, filter it, and generate an ad out of it.
+2. Create a YouTube Shorts generator app.
+"""
+from load_dotenv import load_dotenv
+load_dotenv()
+import os 
+from utils import load_html_text
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import ChatPromptTemplate
+from templates import template 
+
+def call():
+    """Generate the advertisement script for the scraped page.
+
+    Loads the page text with ``load_html_text``, sends it as ``context`` to
+    the chat model through the ``template`` system prompt, stores the reply
+    in ``./data/response.txt`` and prints it.
+
+    Returns:
+        The text content of the model's reply.
+    """
+    context = load_html_text()
+    chat_prompt = ChatPromptTemplate.from_messages([
+        ("system", template)
+    ])
+    chain = chat_prompt | ChatOpenAI()
+    response = chain.invoke({"context": context})
+
+    # Write as UTF-8 explicitly: the reply may contain characters that the
+    # platform's default encoding cannot represent.
+    with open("./data/response.txt", "w", encoding="utf-8") as f:
+        f.write(response.content)
+    print(response.content)
+    return response.content
+
+# print(call())
diff --git a/app/templates.py b/app/templates.py
@@ -0,0 +1,89 @@
+from langchain.prompts import PromptTemplate
+
+from langchain.prompts.chat import ChatPromptTemplate
+
+# System prompt used to generate the advertisement script.
+# ``{context}`` is replaced with the text scraped from the source page.
+template = """
+Craft a compelling advertisement script as if you were a seasoned content creator expert.
+Your task is to create a persuasive and engaging promotional piece tailored to a specific context provided.
+Consider the target audience, key messaging, and the overall tone to captivate and drive interest effectively.
+Dive into the realm of creativity, utilizing your expertise to seamlessly blend innovation and consumer appeal into a seamless promotional narrative.
+Remember to provide details that showcase the uniqueness of the product or service while delivering a memorable and impactful call-to-action.
+###############################
+your script will be used as a short reel/video along with images that describes the text . 
+your script will be passed to a text to speech model to convert it into an audio 
+use the following examples as a refrence 
+don't output music indicator like [Upbeat music playing] ,[Upbeat music fades out]'
+**********************************************
+examples :
+
+Example Pair 1:
+
+Image Description 1:
+"A vibrant can of Bolt Boost energy drink surrounded by dynamic lightning bolts, symbolizing energy and power."
+Ad Text 1:
+"Unleash the Power Within! Introducing Bolt Boost – the energy drink that fuels your ambition. Tackle your day with vitality and focus. Time to elevate your energy game!"
+
+
+Image Description 2:
+"An energetic individual conquering challenges with a glowing aura, holding a can of Bolt Boost, surrounded by a vibrant, active environment."
+Ad Text 2:
+"Revitalize your day with Bolt Boost! Packed with natural ingredients and a burst of flavor, this energy elixir keeps you at your peak. Elevate your performance, embrace the Bolt Boost experience!"
+
+Image Description 3:
+"A creative workspace with Bolt Boost cans scattered around, featuring a laptop with artistic tools, showcasing the synergy between the energy drink and creative endeavors."
+Ad Text 3:
+"Fuel Your Passion! Bolt Boost, the ultimate energy companion for creators. Whether you're a designer, writer, or artist, power up your creativity and break through boundaries. Unleash your potential!"
+
+
+Image Description 4:
+"A visually stunning scene of a creative mind at work, surrounded by Bolt Boost cans and a burst of vibrant colors, highlighting the fusion of creativity and energy."
+Ad Text 4:
+"Create, Energize, Repeat! Bolt Boost – the choice of innovators. Sip on inspiration and crush creative blocks. Elevate your craft with the energy that matches your ambition."
+
+************************************************
+context:{context}
+"""
+
+# Prompt that asks the model to extract only the narrator lines from a
+# generated script, one value per line. ``{context}`` is the script text.
+template_narrator = """
+scrape all the narrator text from the following context 
+use the examples blow as a refrence 
+examples : 
+\n\nNarrator: "Passion. Quality. Commitment. At McDonald\'s, we\'re passionate about our food, always striving to provide you with the best dining experience possible.
+"\n\n[Images of fresh ingredients being prepared and cooked]
+\n\nNarrator: "From our balanced options in the Happy Meal to our Quarter Pounder burgers made with 100% fresh beef cooked to order, we\'re committed to serving you quality food.
+"\n\n[Close-up shots of various menu items]
+ your output should be like below : 
+
+Passion. Quality. Commitment. At McDonald\'s, we\'re passionate about our food, always striving to provide you with the best dining experience possible.\n
+From our balanced options in the Happy Meal to our Quarter Pounder burgers made with 100% fresh beef cooked to order, we\'re committed to serving you quality food.\n
+context:{context}.
+Your response should be single values seperated by a new line \n
+append \n to every value you parse 
+don't forget any narration 
+the count of narrations extracted should be the same as image descriptions
+"""
+
+# Prompt that asks the model to extract only the bracketed image
+# descriptions from a generated script, one value per line.
+# ``{context}`` is the script text.
+template_images = """
+scrape all the images description from the following context 
+images description are always enclosed in square brackets [] 
+every Narrator text is followed by an image description . please scrape all the images 
+use the examples blow as a refrence 
+examples : 
+\n\nNarrator: "Passion. Quality. Commitment. At McDonald\'s, we\'re passionate about our food, always striving to provide you with the best dining experience possible.
+"\n\n[Images of fresh ingredients being prepared and cooked]
+\n\nNarrator: "From our balanced options in the Happy Meal to our Quarter Pounder burgers made with 100% fresh beef cooked to order, we\'re committed to serving you quality food.
+"\n\n[Close-up shots of various menu items]
+
+ your output should be like below : 
+
+fresh ingredients being prepared and cooked\n
+Close-up shots of various menu items\n
+
+context:{context}.
+Your response should be single values seperated by a new line \n
+append \n to every value you parse 
+don't enclude any image indicator in the output . just extract all the image description.
+don't add any text to it 
+the count of narrations extracted should be the same as image descriptions
+don't forget any image.
+"""
diff --git a/app/utils.py b/app/utils.py
@@ -0,0 +1,52 @@
+
+def download_html_from_url(url, timeout=30):
+    """Download a web page and store its HTML in ``./data/file.html``.
+
+    Args:
+        url: Address of the page to fetch.
+        timeout: Seconds to wait for the server before giving up.
+
+    When the response status is not 200 nothing is written and the status
+    code is printed instead.
+    """
+    import os
+    import requests
+
+    # Without a timeout the request can block indefinitely on a server that
+    # never answers.
+    response = requests.get(url, timeout=timeout)
+    if response.status_code != 200:
+        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
+        return
+
+    # Create the output folder on first use and write with an explicit
+    # encoding so the result does not depend on the platform default.
+    os.makedirs("./data", exist_ok=True)
+    with open("./data/file.html", "w", encoding="utf-8") as f:
+        f.write(response.text)
+
+
+def load_html_text(url:str="https://www.mcdonalds.com/us/en-us/about-our-food.html") -> str:
+    """Download an HTML page and return its extracted text.
+
+    The page is saved to ``./data/file.html`` by ``download_html_from_url``
+    and loaded with ``UnstructuredHTMLLoader``. The text of the first loaded
+    document is also written to ``./data/file.txt``.
+
+    Args:
+        url: Address of the HTML page to scrape.
+
+    Returns:
+        The ``page_content`` string of the first loaded document (not the
+        Document object itself).
+
+    NOTE(review): if the download fails, whatever ``./data/file.html``
+    already exists on disk is loaded instead -- confirm this is acceptable.
+    """
+    from langchain.document_loaders import UnstructuredHTMLLoader
+
+    download_html_from_url(url)
+    loader = UnstructuredHTMLLoader(file_path="./data/file.html")
+    page_text = loader.load()[0].page_content
+    # Explicit UTF-8 keeps the dump independent of the platform default.
+    with open("./data/file.txt", "w", encoding="utf-8") as f:
+        f.write(page_text)
+    return page_text
+
+# load_html_text()
+
+# load_html_text(url="https://www.mcdonalds.com/us/en-us/about-our-food.html")
+
+def clean(data:str)->list :
+    """Split ``data`` into lines and strip decoration from each one.
+
+    Every line is trimmed of surrounding whitespace, then double quotes,
+    square brackets and newline characters are removed. Blank lines are kept
+    as empty strings. Each cleaned line and the final list are printed.
+    """
+    # A single translation table deletes all unwanted characters in one pass.
+    unwanted = str.maketrans("", "", '"\n[]')
+    cleaned_lines = [line.strip().translate(unwanted) for line in data.split("\n")]
+    for entry in cleaned_lines:
+        print(entry)
+    print(cleaned_lines)
+    return cleaned_lines
+
+def create_dict_pairs(text:list , images:list) -> list :
+    """Pair each narration with its image description.
+
+    Args:
+        text: Narration strings; blank entries are ignored.
+        images: Image description strings; blank entries are ignored.
+
+    Returns:
+        A list of ``{"text": ..., "image": ...}`` dicts, one per pair, in
+        order. Pairing stops at the shorter of the two filtered lists.
+    """
+    # Drop blank entries from each list *before* pairing. Filtering after
+    # zipping lets a stray blank line in one list shift every later pair
+    # and silently discard valid narrations or images.
+    narrations = [entry for entry in text if len(entry) > 0]
+    descriptions = [entry for entry in images if len(entry) > 0]
+    pairs = [
+        {"text": narration, "image": description}
+        for narration, description in zip(narrations, descriptions)
+    ]
+    print(pairs)
+    return pairs
+
diff --git a/data/audio.mp3 b/data/audio.mp3