tree-sitter: improve update script to fetch all available grammars

Previously, the update script only fetched the few grammars listed in
the tree-sitter repository. However, the tree-sitter GitHub organization
hosts a rather large number of officially supported grammars.

Thus we change the script to query the GitHub API for the organization's
repositories instead (up to 100 repositories are supported without paging).

Since the repository list also contains some repositories that are not
grammars, there is a bash script which lists all repos we are aware of
and the ones we want to ignore. By comparing these lists against the
actual list with jq, it makes sure we don’t forget any repositories in
the future.
This commit is contained in:
Profpatsch 2020-12-24 03:01:10 +01:00
parent a629257ec5
commit c0a4b41afe

View file

@ -3,11 +3,70 @@
, src }:
let
# print all the grammar names mentioned in the fetch-fixtures script
getGrammarNames = writeShellScript "get-grammars.sh" ''
# Check a list of repository names against the grammars we know about.
#
# Reads a JSON array of repository names on stdin and subtracts the `known`
# and `ignore` lists below from it. If anything is left over, the leftover
# names are printed to stderr and the script exits with status 1; otherwise
# the `known` list is printed to stdout as JSON.
#
# The `${/* … */""}` interpolations inside `ignore` are Nix comments wrapped
# in an empty-string interpolation: they expand to nothing in the generated
# script, so the JSON (which has no comment syntax) stays valid.
# NOTE(review): the script is still named "get-grammars.sh" although it now
# checks rather than lists grammars; confirm whether the name should follow.
checkKnownGrammars = writeShellScript "get-grammars.sh" ''
set -euo pipefail
sed -ne 's/^fetch_grammar \(\S*\).*$/\1/p' \
${src}/script/fetch-fixtures
# grammar repositories we know about; on success this list is the output
known='
[ "tree-sitter-javascript"
, "tree-sitter-c"
, "tree-sitter-swift"
, "tree-sitter-json"
, "tree-sitter-cpp"
, "tree-sitter-ruby"
, "tree-sitter-razor"
, "tree-sitter-go"
, "tree-sitter-c-sharp"
, "tree-sitter-python"
, "tree-sitter-typescript"
, "tree-sitter-rust"
, "tree-sitter-bash"
, "tree-sitter-php"
, "tree-sitter-java"
, "tree-sitter-scala"
, "tree-sitter-ocaml"
, "tree-sitter-julia"
, "tree-sitter-agda"
, "tree-sitter-fluent"
, "tree-sitter-html"
, "tree-sitter-haskell"
, "tree-sitter-regex"
, "tree-sitter-css"
, "tree-sitter-verilog"
, "tree-sitter-jsdoc"
, "tree-sitter-ql"
]'
# repositories that are deliberately skipped and must not be reported as
# unknown; the reason for most entries is given inline
ignore='
[ "tree-sitter"
, "tree-sitter-cli"
, "tree-sitter-embedded-template"
${/*this is the haskell language bindings, tree-sitter-haskell is the grammar*/""}
, "haskell-tree-sitter"
${/*this is the ruby language bindings, tree-sitter-ruby is the grammar*/""}
, "ruby-tree-sitter"
${/*this is the (unmaintained) rust language bindings, tree-sitter-rust is the grammar*/""}
, "rust-tree-sitter"
${/*this is the nodejs language bindings, tree-sitter-javascript is the grammar*/""}
, "node-tree-sitter"
${/*this is the python language bindings, tree-sitter-python is the grammar*/""}
, "py-tree-sitter"
${/*afl fuzzing for tree sitter*/""}
, "afl-tree-sitter"
${/*archived*/""}
, "highlight-schema"
${/*website*/""}
, "tree-sitter.github.io"
]'
# array difference: every name from stdin that is neither known nor ignored
res=$(${jq}/bin/jq \
--argjson known "$known" \
--argjson ignore "$ignore" \
'. - ($known + $ignore)' \
)
# an empty difference is printed by jq as []; anything else means there are
# repositories nobody has classified yet
if [ ! "$res" == "[]" ]; then
echo "These repositories are neither known nor ignored:" 1>&2
echo "$res" 1>&2
exit 1
fi
# hand the known list to the caller as JSON
printf '%s' "$known"
'';
# TODO
@ -22,7 +81,7 @@ let
res=$(${curl}/bin/curl \
--silent \
"https://api.github.com/repos/${urlEscape owner}/$(${urlEscapeSh} "$repo")/releases/latest")
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message')" =~ "rate limit" ]]; then
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
echo "rate limited" >&2
fi
release=$(printf "%s" "$res" | ${jq}/bin/jq '.tag_name')
@ -34,6 +93,21 @@ let
echo "$release"
'';
# Find the repositories of a GitHub organization.
#
# Queries the GitHub REST API for the repositories of `orga` (a single page
# of at most 100 entries, no paging) and prints their names to stdout as a
# JSON array. On failure a diagnostic is printed to stderr and the script
# exits non-zero, so callers capturing stdout never mistake an error message
# for the repository list.
latestGithubRepos = { orga }: writeShellScript "latest-github-repos" ''
set -euo pipefail
res=$(${curl}/bin/curl \
--silent \
'https://api.github.com/orgs/${orga}/repos?per_page=100')
# an error response is a JSON object with a "message" field; the trailing ?
# keeps jq quiet when the response is the expected array
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
echo "rate limited" >&2
exit 1
fi
# report extraction failures on stderr and fail, instead of printing the
# diagnostic on stdout with a zero exit status
if ! printf "%s" "$res" | ${jq}/bin/jq 'map(.name)'; then
echo "failed $res" >&2
exit 1
fi
'';
# update one tree-sitter grammar repo and print their nix-prefetch-git output
updateGrammar = { owner }: writeShellScript "update-grammar.sh" ''
set -euo pipefail
@ -49,18 +123,22 @@ let
# Update every known tree-sitter grammar and regenerate grammars/default.nix.
#
# Steps: fetch the repository names of the tree-sitter organization, check
# them against the known/ignored lists (aborting if a repository is not
# accounted for), run the per-grammar update script for each known grammar
# writing its output to grammars/<name>.json, and finally write a
# grammars/default.nix attribute set that reads those JSON files.
update-all-grammars = writeShellScript "update-all-grammars.sh" ''
set -euo pipefail
grammarNames=$(${getGrammarNames})
# progress messages go to stderr
echo "fetching list of grammars" 1>&2
grammars=$(${latestGithubRepos { orga = "tree-sitter"; }})
echo "checking against the list of grammars we know" 1>&2
knownGrammars=$(printf '%s' "$grammars" | ${checkKnownGrammars})
# change the json list into a item-per-line bash format
grammarNames=$(printf '%s' "$knownGrammars" | ${jq}/bin/jq --raw-output '.[]')
# the grammars directory next to this nix file (path filled in by Nix)
outputDir="${toString ./.}/grammars"
mkdir -p "$outputDir"
# build the shell snippet that xe runs per grammar: only the output directory
# is substituted here, $1 stays literal and is the grammar name at run time
updateCommand=$(printf \
'${updateGrammar { owner = "tree-sitter"; }} "$1" > "%s/$1.json"' \
"$outputDir")
# run the update snippet once per grammar name, two jobs at a time
printf '%s' "$grammarNames" \
| ${xe}/bin/xe printf "tree-sitter-%s\n" {} \
| ${xe}/bin/xe -j2 -s "$updateCommand"
# generate default.nix: one attribute per grammar, read from its JSON file
( echo "{"
printf '%s' "$grammarNames" \
| ${xe}/bin/xe -s 'printf " %s = (builtins.fromJSON (builtins.readFile ./tree-sitter-%s.json));\n" "$1" "$1"'
| ${xe}/bin/xe -s 'printf " %s = (builtins.fromJSON (builtins.readFile ./%s.json));\n" "$1" "$1"'
echo "}" ) \
> "$outputDir/default.nix"
'';