<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Evaluation on Hi, I&#39;m Muhammad Amal</title>
    <link>https://muhammadamal.my.id/tags/evaluation/</link>
    <description>Recent content in Evaluation on Hi, I&#39;m Muhammad Amal</description>
    <generator>Hugo</generator>
    <language>en-us</language>
    <lastBuildDate>Wed, 29 Jan 2025 09:00:00 +0700</lastBuildDate>
    <atom:link href="https://muhammadamal.my.id/tags/evaluation/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Benchmarking SLMs for Your Use Case, From Lmeval to Custom Suites</title>
      <link>https://muhammadamal.my.id/blog/benchmarking-slms-for-your-use-case-lmeval-to-custom/</link>
      <pubDate>Wed, 29 Jan 2025 09:00:00 +0700</pubDate>
      <guid>https://muhammadamal.my.id/blog/benchmarking-slms-for-your-use-case-lmeval-to-custom/</guid>
      <description>Public leaderboards lie about your task. Build a benchmark that measures what your users actually need.</description>
    </item>
    <item>
      <title>Evaluating LLM Agents, From Vibes to Regression Suites</title>
      <link>https://muhammadamal.my.id/blog/evaluating-llm-agents/</link>
      <pubDate>Fri, 24 May 2024 09:00:00 +0700</pubDate>
      <guid>https://muhammadamal.my.id/blog/evaluating-llm-agents/</guid>
      <description>A practical agent evaluation system with deterministic checks, LLM-as-judge rubrics, and the regression discipline that survives model upgrades.</description>
    </item>
    <item>
      <title>Evaluating RAG, Beyond Vibes-Based Testing</title>
      <link>https://muhammadamal.my.id/blog/rag-evaluation-ragas-trulens-deepeval/</link>
      <pubDate>Mon, 26 Feb 2024 09:00:00 +0700</pubDate>
      <guid>https://muhammadamal.my.id/blog/rag-evaluation-ragas-trulens-deepeval/</guid>
      <description>Ragas, TruLens, DeepEval — measuring RAG quality. Faithfulness, context precision, answer relevance. CI integration without LLM-as-judge bills.</description>
    </item>
    <item>
      <title>Putting a RAG Evaluation Pipeline in CI, The Setup I Actually Use</title>
      <link>https://muhammadamal.my.id/blog/rag-evaluation-pipeline-ci/</link>
      <pubDate>Mon, 20 Nov 2023 09:00:00 +0700</pubDate>
      <guid>https://muhammadamal.my.id/blog/rag-evaluation-pipeline-ci/</guid>
      <description>A practical RAG eval setup wired into CI — retrieval and generation metrics, golden questions, and catching silent regressions.</description>
    </item>
  </channel>
</rss>
